In [None]:
# Mount Google Drive at /content/drive; the dataset path read further down
# lives under this mount point. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# Lift pandas' column limit so displayed DataFrames show every column.
pd.set_option("display.max_columns", None)

In [None]:
# load dataset
# Read the training set from the mounted Drive into the module-level `data`
# frame; the functions and cells below all read this global.
data = pd.read_json('/content/drive/MyDrive/Accountstory/data/TRAINING_dataset.json')
# Bare expression: renders the frame as the cell output.
data

# Logistic regression for numerical values

* optimize prediction threshold

In [None]:
def log_reg(feature):
  """Fit a one-feature logistic regression and tune its decision threshold.

  ``data[feature]`` is the single predictor and
  ``data['stakeholder_category']`` the target. Rows are split 50/50
  (stratified, fixed seed), the model is fitted on the first half, and 21
  evenly spaced thresholds in [0, 1] are scored on the second half.

  The score of a threshold is the mean of ``|decision - label|`` where
  ``decision`` is 1 when the predicted probability of the first class is at
  least the threshold. Assuming the labels are 0/1 this equals the accuracy
  of "predict class 0 when P(class 0) >= threshold" -- TODO confirm labels.

  NOTE(review): the threshold is chosen on the same half that produces the
  reported score, so the score is an optimistic estimate.

  Args:
    feature: name of one column of the module-level ``data`` frame.

  Returns:
    ``(score, thresh)``: the best score found and the first threshold that
    reaches it.
  """
  X = np.array(data[feature]).reshape(-1, 1)
  y = np.array(data['stakeholder_category'])
  trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)
  clf = LogisticRegression(random_state=0)
  clf.fit(trainX, trainy)

  # The probabilities do not depend on the threshold, so compute them once.
  proba_first = clf.predict_proba(testX)[:, 0]

  # Start below any reachable score so `thresh` is always bound; the previous
  # 0.5 starting value left it undefined when no threshold strictly beat 0.5.
  score, thresh = -np.inf, None
  for i in np.linspace(0, 1, 21):
    decisions = (proba_first >= i).astype(int)
    score_temp = np.abs(decisions - testy).sum() / len(testy)
    if score_temp > score:
      score, thresh = score_temp, i
  return score, thresh

In [None]:
# Numerical columns scored one at a time. `features` stays a module-level
# name: the multivariate log_reg() further down reads it as a global.
features = ['uni_rank', 'degree_score', 'month_of_service', 'skills_count', 'company_count']

for feat in features:
  # Only the score is printed; the tuned threshold `t` is not used here.
  s, t = log_reg(feat)
  print(round(s,2),feat)

0.63 uni_rank
0.6 degree_score
0.51 month_of_service
0.54 skills_count
0.61 company_count


* multivariate

In [None]:
def log_reg(feature=None):
  """Fit a logistic regression on several columns and tune its threshold.

  This definition replaces the earlier single-feature ``log_reg``. Accepting
  an optional ``feature`` keeps calls such as ``log_reg('uni_rank')`` working
  after the redefinition, so earlier cells can still be re-run.

  Rows are split 50/50 (stratified, fixed seed), the model is fitted on the
  first half, and 21 evenly spaced thresholds in [0, 1] are scored on the
  second half. The score of a threshold is the mean of ``|decision - label|``
  where ``decision`` is 1 when the predicted probability of the first class
  is at least the threshold; assuming 0/1 labels this is the accuracy of
  "predict class 0 when P(class 0) >= threshold" -- TODO confirm labels.

  NOTE(review): the threshold is chosen on the same half that produces the
  reported score, so the score is an optimistic estimate.

  Args:
    feature: ``None`` (default) to use every column named in the module-level
      ``features`` list, a single column name, or a sequence of column names.

  Returns:
    ``(score, thresh)``: the best score found and the first threshold that
    reaches it.
  """
  if feature is None:
    columns = list(features)
  elif isinstance(feature, str):
    columns = [feature]
  else:
    columns = list(feature)

  X = np.array(data[columns]).reshape(-1, len(columns))
  y = np.array(data['stakeholder_category'])
  trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)
  clf = LogisticRegression(random_state=0)
  clf.fit(trainX, trainy)

  # The probabilities do not depend on the threshold, so compute them once.
  proba_first = clf.predict_proba(testX)[:, 0]

  # Start below any reachable score so `thresh` is always bound; the previous
  # 0.5 starting value left it undefined when no threshold strictly beat 0.5.
  score, thresh = -np.inf, None
  for i in np.linspace(0, 1, 21):
    decisions = (proba_first >= i).astype(int)
    score_temp = np.abs(decisions - testy).sum() / len(testy)
    if score_temp > score:
      score, thresh = score_temp, i
  return score, thresh

In [None]:
# Run the multivariate model on the current `features` list; the returned
# (score, threshold) tuple is shown as the cell output.
log_reg()

(0.6293279022403259, 0.4)

# Counting for binary values (naive approach)

In [None]:
def count(feature):
  """Return the mean absolute difference between a column and the label.

  Both ``data[feature]`` and ``data['stakeholder_category']`` are read from
  the module-level ``data``. For 0/1 columns the result is the fraction of
  rows in which the feature value differs from the label.
  """
  labels = np.array(data['stakeholder_category'])
  values = np.array(data[feature])
  gaps = abs(values - labels)
  return sum(gaps) / len(labels)

In [None]:
# Binary indicator columns scored with count(). This rebinds the module-level
# `features` list used by the logistic-regression cells above.
features = ['sub_role_software', 'us_name', 'company_global', 'level_manager']

for feat in features:
  s = count(feat)
  print(round(s,2),feat)

0.55 sub_role_software
0.55 us_name
0.56 company_global
0.57 level_manager


* gender

In [None]:
# Share of rows with stakeholder_category == 1 within each gender code.
# Gender code 1 feeds the `m` counters and -1 the `f` counters; `DM` / `nDM`
# mean category 1 / 0 (presumably "decision maker" -- TODO confirm).
#
# Boolean masks replace the per-row loop: `data['gender'][i]` is a label
# lookup, so the loop only worked on a default 0..n-1 index.
gender_codes = np.asarray(data['gender'])
categories = np.asarray(data['stakeholder_category'])

cnt_m_DM = int(np.sum((gender_codes == 1) & (categories == 1)))
cnt_f_DM = int(np.sum((gender_codes == -1) & (categories == 1)))
cnt_m_nDM = int(np.sum((gender_codes == 1) & (categories == 0)))
cnt_f_nDM = int(np.sum((gender_codes == -1) & (categories == 0)))

total_m = cnt_m_nDM + cnt_m_DM
total_f = cnt_f_nDM + cnt_f_DM

# Print nan instead of raising ZeroDivisionError when a gender code is absent.
print(cnt_m_DM / total_m if total_m else float('nan'),
      cnt_f_DM / total_f if total_f else float('nan'))

0.4274447949526814 0.5991735537190083


# SVM

In [None]:
def svm_reg(feature, C, deg, kernel='rbf'):
  """Fit a scaled SVM classifier on one feature and return its test score.

  ``data[feature]`` is the single predictor and
  ``data['stakeholder_category']`` the target. Rows are split 50/50
  (stratified, fixed seed); the pipeline standardises the feature and fits an
  ``SVC`` on the first half.

  Args:
    feature: name of one column of the module-level ``data`` frame.
    C: regularisation parameter passed to ``SVC``.
    deg: polynomial degree passed to ``SVC``. scikit-learn uses it only when
      ``kernel='poly'``; with the default RBF kernel it has no effect.
    kernel: ``SVC`` kernel. Defaults to ``'rbf'``, which is what ``SVC`` used
      when no kernel was given, so existing calls are unchanged.

  Returns:
    The value of ``clf.score`` on the held-out half.
  """
  X = np.array(data[feature]).reshape(-1, 1)
  y = np.array(data['stakeholder_category'])
  trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)
  clf = make_pipeline(
      StandardScaler(),
      SVC(kernel=kernel,
          gamma='auto',
          C=C,
          degree=deg,
          decision_function_shape='ovr'
          ))
  clf.fit(trainX, trainy)

  return clf.score(testX, testy)

* each feature

In [None]:
features = ['uni_rank', 'degree_score', 'month_of_service', 'skills_count', 'company_count']

# svm_reg fits an SVC with the RBF kernel, which ignores `degree`, and its
# split uses a fixed seed. Looping over 40 degrees therefore refitted the
# same model 40 times per C; one fit per C yields the same maximum.
# Restore a degree loop only if svm_reg is switched to a polynomial kernel.
UNUSED_DEGREE = 3  # SVC's own default; has no effect with the RBF kernel

for feat in features:
  score = 0
  for c in range(1, 40):
    # Progress line: current C and the best score so far for this feature.
    print(c, score)
    score = max(score, svm_reg(feat, c, UNUSED_DEGREE))
  print(feat, round(score,3),feat)

* multivariate

In [None]:
def svm_reg_mult(C, deg, kernel='rbf'):
  """Fit a scaled SVM classifier on all ``features`` and return its test score.

  The predictors are the columns named in the module-level ``features`` list
  and the target is ``data['stakeholder_category']``. The pipeline
  standardises the predictors and fits an ``SVC`` on the training part of a
  stratified, fixed-seed split.

  NOTE(review): ``test_size=0.7`` means the model trains on 30% of the rows,
  whereas the single-feature helpers in this notebook use a 50/50 split --
  confirm this is intended.

  Args:
    C: regularisation parameter passed to ``SVC``.
    deg: polynomial degree passed to ``SVC``. scikit-learn uses it only when
      ``kernel='poly'``; with the default RBF kernel it has no effect.
    kernel: ``SVC`` kernel. Defaults to ``'rbf'``, which is what ``SVC`` used
      when no kernel was given, so existing calls are unchanged.

  Returns:
    The value of ``clf.score`` on the held-out part.
  """
  X = np.array(data[features]).reshape(-1, len(features))
  y = np.array(data['stakeholder_category'])
  trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.7, random_state=2, stratify=y)
  clf = make_pipeline(
      StandardScaler(),
      SVC(kernel=kernel,
          gamma='auto',
          C=C,
          degree=deg,
          decision_function_shape='ovr'
          ))
  clf.fit(trainX, trainy)
  return clf.score(testX, testy)

In [None]:
# svm_reg_mult fits an SVC with the RBF kernel, which ignores `degree`, and
# its split uses a fixed seed. Looping over 40 degrees therefore refitted the
# same model 40 times per C; one fit per C yields the same maximum.
# Restore a degree loop only if svm_reg_mult is switched to a polynomial kernel.
UNUSED_DEGREE = 3  # SVC's own default; has no effect with the RBF kernel

score = 0
for c in range(1, 40):
  # Progress line: current C and the best score so far.
  print(c, score)
  score = max(score, svm_reg_mult(c, UNUSED_DEGREE))

score