<a href="https://colab.research.google.com/github/JVerbeek/AML/blob/main/AML_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Single seed shared by every source of randomness in the notebook.
seed = 3141592
random.seed(seed)
# scikit-learn estimators created without an explicit random_state draw from
# NumPy's global RNG, which the stdlib `random` seed above does not touch.
np.random.seed(seed)


In [None]:
from google.colab import drive

# Path inside the Colab VM under which Google Drive is mounted.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the training table from the mounted Drive and show its (rows, columns).
train_csv_path = "/content/drive/MyDrive/train_data.csv"
df = pd.read_csv(train_csv_path)
df.shape

(1593, 2651)

In [None]:
# One empty bucket per column family; each bucket is filled by the
# name-matching loop in the next cell.
(mean_keys, median_keys, minmax_keys, nonzero_keys,
 total_keys, std_keys, rest_keys) = ([] for _ in range(7))
# Same seven list objects, collected for convenient iteration.
keysets = [mean_keys, median_keys, minmax_keys, nonzero_keys, total_keys, std_keys, rest_keys]

In [None]:
# Ordered (substrings, bucket) rules. The first rule with a matching substring
# wins, so "non_zero_calls" is tested before the broader "min"/"max" rule.
key_rules = [
    (("mean",), mean_keys),
    (("median",), median_keys),
    (("non_zero_calls",), nonzero_keys),
    (("min", "max"), minmax_keys),
    (("total",), total_keys),
    (("std",), std_keys),
]
for key in df.keys():
  # Columns that match no rule fall through to rest_keys.
  bucket = next(
      (target for needles, target in key_rules
       if any(needle in key for needle in needles)),
      rest_keys,
  )
  bucket.append(key)

In [None]:
# NOTE(review): this head() is not the last expression of the cell, so its
# result is computed but never displayed.
df[mean_keys].head()
# List every column name that matched none of the keyword buckets.
for key in rest_keys:
  print(key)

opportunity_id
current_date_day
current_date_month
current_date_year
CRM__days_since_last_crm_amount_change
QUESTIONS__non_zero_num_of_questions_in_calls_by_company__from_initial_high_interaction_day
LENGTH__non_zero_call_duration__from_current_stage_start
RESPONSE_TIME__non_zero_email_customer_response_time__last_28_days
PARTICIPANTS__customer_num_of_participants__from_initial_high_interaction_day
RESPONSE_TIME__num_company_emails_with_no_response_in_timeframe_window__last_7_days
LENGTH__non_zero_time_spoken_percent_company__last_7_days
LENGTH__non_zero_call_duration__last_28_days
QUESTIONS__non_zero_num_of_questions_in_calls_by_customer__last_7_days
LENGTH__non_zero_time_spoken_percent_non_company__from_current_stage_start
RESPONSE_TIME__non_zero_email_company_response_time__last_21_days
QUESTIONS__non_zero_num_of_questions_in_calls__from_current_stage_start
COMMUNICATION_ACTIVITY__company_to_customer_emails_ratio__from_opp_creation
LENGTH__non_zero_call_duration__last_21_days
PARTIC

In [None]:
# For each target class, collect the columns that contain no NA within that
# class. key_list[i] holds the NA-free column names for target == i.
key_list = []
for key in range(3):
  filter_df = df.loc[df["target"] == key]
  cond = filter_df.isna().sum().eq(0)   # True for columns without any NA
  complete_columns = filter_df.loc[:, cond]
  print(complete_columns.isna().sum())
  key_list.append(complete_columns.keys())
# c1, c2, c3 are the NA-free columns of target classes 0, 1 and 2 respectively.
c1, c2, c3 = key_list
# NA-free in class 1 but not in class 0.
unique_c2 = [c for c in c2 if c not in c1]
# NA-free in class 0 but not in class 2.
unique_c1 = [c for c in c1 if c not in c3]
print(unique_c2, unique_c1)

opportunity_id                                                       0
current_date_day                                                     0
current_date_month                                                   0
current_date_year                                                    0
QUESTIONS__std_num_of_questions_in_calls_by_customer__last_7_days    0
                                                                    ..
opportunity_stage_at_time_of_weekstart_max                           0
opportunity_stage_at_time_of_weekstart_min                           0
current_quarter                                                      0
timetoclose                                                          0
target                                                               0
Length: 414, dtype: int64
opportunity_id                                                       0
current_date_day                                                     0
current_date_month                                 

So class 0 and class 2 have NAs in the same columns except for `CRM__current_crm_amount`, only class 2 has that column. This feature is bound to be informative for separating class 0 from class 2 then. 

Class 1 has 65 columns that are unique to that class, in the sense that they contain no NAs for class 1 but do contain NAs for the other classes.
If we examine these 65 columns, they all have to do with whether the customer had a high interaction day. Let's check how many columns we have with `high_interaction_day` in them. 

In [None]:
# Number of columns whose name mentions the initial high-interaction day.
sum(1 for key in df.keys() if "initial_high_interaction_day" in key)

484

The target column appears last. Let's check it out:

In [None]:
def make_hist(title, data):
  """Draw and show a histogram of `data`.

  Args:
    title: text inserted into the plot title as "Histogram of <title>".
    data: values handed unchanged to `plt.hist`.
  """
  plt.hist(data)
  plt.title(f"Histogram of {title}")
  plt.xlabel("value")
  plt.ylabel("no. samples")
  plt.show()

This is apparently a classification problem. 
The classes are not perfectly balanced, but it is not so extreme that I would say we need to balance the classes. We should probably stratify if we K-fold, though.

### Dealing with NAs
There are quite a number of columns that have many NaN values. For now we drop these and use the columns that do have values. 
We may want to impute these values later on. 

In [None]:
# Separate the feature columns (X) from the label column (y).
X = df.drop(columns="target")
y = df["target"]

In [None]:
# Count the missing values per column once and reuse the result below.
na_counts = X.isna().sum()
isnas_500 = na_counts < 500   # keep columns with fewer than 500 missing values
isnas = na_counts == 0        # columns without a single missing value
# Take real copies: later cells assign into df_few_na, and assigning into a
# slice of X is what raises pandas' SettingWithCopyWarning.
df_few_na = X.loc[:, isnas_500.values].copy()
df_no_na = X.loc[:, isnas.values].copy()
few_na_keys = df_few_na.keys()
print(isnas_500.sum())

583


#### Impute missing values iteratively
Iterative imputation is a questionable choice here: with this many columns it takes a very long time to run, but we use it anyway for now.

In [None]:
# Columns with fewer than this many distinct observed values are treated as
# categorical; everything else is treated as continuous.
max_categorical_levels = 10
categorical = []
continuous = []
for key in df_few_na.keys():
  # nunique() ignores NaN, so missing entries are not counted as an extra
  # level (np.unique on the raw values would count them).
  if df_few_na[key].nunique() < max_categorical_levels:
    categorical.append(key)
  else:
    continuous.append(key)

print(len(categorical))

53


In [None]:
# The experimental flag has to be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill the missing continuous values and write the completed columns back.
imp = IterativeImputer(random_state=0, max_iter=10)
imputed_continuous = imp.fit_transform(df_few_na[continuous])
df_few_na[continuous] = imputed_continuous

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [None]:
# Replace missing categorical entries with the column's most frequent value.
for key in categorical:
  column = df_few_na[key]
  most_frequent = column.value_counts().idxmax()
  df_few_na[key] = column.fillna(most_frequent)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Total number of missing cells left in df_few_na (per-column counts, then summed).
remaining_na_per_column = df_few_na.isna().sum()
remaining_na_per_column.sum()

0

In [None]:
# Build the working frame from df_few_na, restricted to the few_na_keys columns.
# NOTE(review): this rebinds `df`. The table loaded from CSV (including its
# "target" column) is no longer reachable under that name, so earlier cells
# that read df["target"] will presumably fail if re-run after this one.
df = pd.DataFrame(df_few_na, columns=few_na_keys)
df.describe()

Unnamed: 0,opportunity_id,current_date_day,current_date_month,current_date_year,QUESTIONS__std_num_of_questions_in_calls_by_customer__last_7_days,RESPONSE_TIME__total_email_customer_response_time__last_14_days,QUESTIONS__std_num_of_questions_in_calls_by_company__from_opp_creation,TOPICS__std_calls_Discovery__from_opp_creation,COMMUNICATION_ACTIVITY__total_emails__last_7_days,TOPICS__std_calls_Small_Talk__from_opp_creation,...,overall_rank,sort_order_at_time_of_weekstart,opportunity_probability_at_time_of_weekstart,opportunity_amount_at_time_of_weekstart,fc_transition,opportunity_stage_at_time_of_weekstart_max,opportunity_stage_at_time_of_weekstart_min,stage_transition,current_quarter,timetoclose
count,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,...,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0,1593.0
mean,4.375875e+18,16.441933,7.168236,2020.433773,1.507096,50.910965,6.208278,505.800901,7.748901,596.061926,...,9.475204,7.887633,64.271814,31661.2055,0.175769,28.906466,26.104834,3.318925,2.686127,49.006905
std,2.493243e+18,8.594175,3.201259,0.820307,3.946708,81.100143,5.808861,310.594924,11.250169,194.886862,...,9.424431,6.810652,34.871041,51178.210159,0.504244,10.251028,10.825346,7.547985,1.054145,63.32884
min,1.542746e+16,1.0,1.0,2018.0,0.0,0.0,0.0,-286.083719,0.0,16.15975,...,1.0,0.0,0.0,0.0,-3.0,0.0,0.0,-34.0,1.0,-14.0
25%,2.454665e+18,9.0,5.0,2020.0,0.0,0.0,0.0,277.54901,0.0,469.764038,...,3.0,1.0,30.0,10000.0,0.0,21.0,17.0,0.0,2.0,9.0
50%,3.809525e+18,17.0,7.0,2021.0,0.0,8.380833,6.015605,561.639781,3.0,568.242026,...,6.0,6.0,75.0,15000.0,0.0,32.0,27.0,0.0,3.0,25.0
75%,6.652569e+18,24.0,10.0,2021.0,0.0,76.248889,10.356158,746.618336,12.0,744.804098,...,13.0,15.0,95.0,37800.0,0.0,40.0,34.0,8.0,4.0,62.0
max,9.207329e+18,31.0,12.0,2021.0,38.0,670.687778,30.711358,1286.583063,124.0,1286.583063,...,57.0,18.0,100.0,728750.0,2.0,40.0,40.0,32.0,4.0,327.0


In [None]:
# Minimum Pearson correlation with the target for a column to be kept.
corr_threshold = 0.25
corr_keys = []
for key in df.keys():
  # A constant column has zero variance, so its correlation is undefined:
  # np.corrcoef would emit divide-by-zero RuntimeWarnings and return NaN.
  # Such a column can never pass the threshold, so skip it up front.
  if df[key].nunique() <= 1:
    continue
  # NOTE(review): only positive correlations are selected; strongly negative
  # ones are ignored. Confirm this is intended rather than abs(correlation).
  if np.corrcoef(df[key], y=y)[0, 1] > corr_threshold:
    corr_keys.append(key)
print(len(corr_keys))

15


  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:

# Integer-encode every categorical column in place (one fresh encoder per column).
for key in categorical:
  le = LabelEncoder()
  df[key] = le.fit_transform(df[key])

# Standardise the continuous columns in place.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so held-out rows contribute to the scaling statistics. Confirm this is acceptable.
ss = StandardScaler()
df[continuous] = ss.fit_transform(df[continuous])

In [None]:
# Preview the first five rows of the working frame.
df.head(5)

Unnamed: 0,opportunity_id,current_date_day,current_date_month,current_date_year,QUESTIONS__std_num_of_questions_in_calls_by_customer__last_7_days,RESPONSE_TIME__total_email_customer_response_time__last_14_days,QUESTIONS__std_num_of_questions_in_calls_by_company__from_opp_creation,TOPICS__std_calls_Discovery__from_opp_creation,COMMUNICATION_ACTIVITY__total_emails__last_7_days,TOPICS__std_calls_Small_Talk__from_opp_creation,...,opportunity_amount_at_time_of_weekstart,forecast_category_at_time_of_weekstart_1_Commit,forecast_category_at_time_of_weekstart_3_Best_Case,forecast_category_at_time_of_weekstart_4_Pipeline,fc_transition,opportunity_stage_at_time_of_weekstart_max,opportunity_stage_at_time_of_weekstart_min,stage_transition,current_quarter,timetoclose
0,-1.749455,-1.68096,1.197331,3,-0.381982,-0.170075,-1.069096,-1.015813,-0.511166,-0.385421,...,-0.248293,0,0,0,2,-0.576364,-0.286902,-0.439848,3,-0.331816
1,-1.749455,-0.8662,1.197331,3,-0.381982,-0.605482,-1.069096,-1.203826,0.911481,-0.607597,...,0.210766,0,0,0,2,1.082527,1.28398,1.813116,3,-0.442385
2,-1.734039,-0.633411,0.884856,3,-0.381982,0.958374,-1.069096,-0.717659,1.444974,-1.21008,...,-0.595004,0,0,0,2,1.082527,1.28398,-0.439848,3,-0.60034
3,-1.730028,0.996111,-0.05257,3,-0.381982,-0.051011,0.050236,-0.689281,-0.066589,-1.561812,...,-0.427293,0,1,0,3,1.082527,-1.303354,-1.102484,2,-0.695113
4,-1.730028,-1.797355,0.259906,3,1.171258,0.223592,0.027621,-0.704303,1.089312,-1.585752,...,-0.426875,0,0,0,2,1.082527,1.28398,-0.439848,2,-0.789887


### PCA?

In [None]:
# Fit a 5-component PCA on the continuous columns only.
pca = PCA(n_components=5)
pca.fit(df[continuous].to_numpy())

In [None]:
# Share of variance captured by each fitted component, reported and plotted.
explained_variance = pca.explained_variance_ratio_
top5_percent = sum(explained_variance[:5]) * 100
print(f"{top5_percent}% of the variance is explained by the first 5 PCs")
component_positions = np.arange(len(explained_variance))
plt.bar(component_positions, explained_variance)
plt.show()

Examine the loadings (entries of the right singular vectors) of the first two PCs:

In [None]:
# Thin SVD: only the right singular vectors are used below, so there is no
# need to build the full square U (rows x rows) that full_matrices=True returns.
[U, S, V_t] = np.linalg.svd(df[continuous].values, full_matrices=False)
V = V_t.transpose()   # columns of V are the right singular vectors

# Bar plot of the entries (one per input column) of the first two vectors.
for pc in (0, 1):
  # V[:, pc] has V.shape[0] entries, so that is the number of bar positions.
  plt.bar(np.arange(V.shape[0]), V[:, pc])
  plt.xlabel("continuous feature index")
  plt.ylabel("loading")
  plt.title(f"Right singular vector {pc + 1}")
  plt.show()

In [None]:
# Project the continuous columns onto the first five right singular vectors.
Y = df[continuous].values
Z = np.dot(Y, V[:,:5])
# Split the projected rows by target class; the masks are converted to plain
# arrays so the selection is positional and independent of y's index labels.
Z0 = Z[np.asarray(y == 0)]
Z1 = Z[np.asarray(y == 1)]
Z2 = Z[np.asarray(y == 2)]
scatter_pc1_pc2 = plt.figure()

# One scatter plot per pair of consecutive components; classes are drawn in
# the order 1, 2, 0 and labelled so they can be told apart.
for (i, j) in [(0, 1), (1, 2), (2, 3)]:
  plt.scatter(Z1[:,i], Z1[:,j], label="class 1")
  plt.scatter(Z2[:,i], Z2[:,j], label="class 2")
  plt.scatter(Z0[:,i], Z0[:,j], label="class 0")
  plt.xlabel(f"PC{i + 1}")
  plt.ylabel(f"PC{j + 1}")
  plt.legend()
  plt.show()

In [None]:
# import plotly.express as px

# px.scatter_3d(Z, Z[:,0], Z[:,1], Z[:,2], color = y, size=y)

In [None]:
# Refit the PCA on the continuous columns and keep the projected rows as a frame.
continuous_matrix = df[continuous].values
pca_data = pca.fit_transform(continuous_matrix)
pca_data_df = pd.DataFrame(data=pca_data)

### Try some mRMR filtering

In [None]:
!pip install mrmr_selection

In [None]:
import mrmr

### Some rather uninteresting stuff, such as the model

In [None]:
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import KFold, StratifiedKFold


In [None]:
# Centre each continuous column on its own mean. Calling .mean() without an
# axis returns one scalar over the whole matrix, which shifts every column by
# the same amount and lets large-valued columns dominate the shift.
cont_values = df[continuous].values
cont = cont_values - cont_values.mean(axis=0)
cat = df[categorical].values
# Feature matrix: centred continuous columns followed by the categorical ones.
data = np.concatenate((cont, cat), axis=1)
print(data.shape)

(1593, 583)


In [None]:
# Stratified k-fold estimate of random-forest accuracy on `data`.
splits = 10
kf = StratifiedKFold(splits)

acc_est = np.zeros((splits, 1))   # one accuracy per fold
for i, (train_index, test_index) in enumerate(kf.split(data, y=y)):
    # `data` is a NumPy array, so rows are selected positionally (it has no .loc).
    X_train, X_test = data[train_index], data[test_index]
    # The fold indices are positions, hence .iloc rather than label lookup.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # mRMR feature selection (mrmr.mrmr_classif) could be applied per fold
    # here; it is currently disabled.
    clf = RandomForestClassifier(n_estimators=100, random_state=seed)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc_est[i, 0] = metrics.accuracy_score(y_test, y_pred)
    print(acc_est[i])
print(acc_est.mean(axis=0), f"({splits}-fold CV)")

AttributeError: ignored

In [None]:

# Stratified hold-out split; seeding it keeps the reported test metrics reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, y, stratify=y, random_state=seed)

In [None]:
from sklearn.model_selection import GridSearchCV
# Seed the forest so repeated searches on the same split give the same scores.
pipe = make_pipeline(StandardScaler(), RandomForestClassifier(criterion='gini', random_state=seed))
# Parameter names are prefixed with the lower-cased step name that make_pipeline assigns.
# NOTE(review): both max_depth candidates are probably deeper than any tree can
# grow on a training set of this size, so they may behave identically. Confirm.
parameters = {'randomforestclassifier__n_estimators': [50, 100, 500], 'randomforestclassifier__max_depth': [100, 500]}
gridsearch = GridSearchCV(pipe, parameters, cv=10, verbose=3)
gridsearch.fit(X_train, y_train)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV 1/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.783 total time=   0.6s
[CV 2/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.833 total time=   0.6s
[CV 3/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.858 total time=   0.6s
[CV 4/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.783 total time=   0.6s
[CV 5/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.798 total time=   0.6s
[CV 6/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.790 total time=   0.6s
[CV 7/10] END randomforestclassifier__max_depth=100, randomforestclassifier__n_estimators=50;, score=0.815 total time=   0.6s
[CV 8/10] END randomforestclassifier__max_depth=100, rand

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier())]),
             param_grid={'randomforestclassifier__max_depth': [100, 500],
                         'randomforestclassifier__n_estimators': [50, 100,
                                                                  500]},
             verbose=3)

In [None]:
# Score the refitted best estimator on the held-out split: accuracy,
# confusion matrix, then the per-class report, in that order.
y_pred = gridsearch.predict(X_test)
for report in (metrics.accuracy_score,
               metrics.confusion_matrix,
               metrics.classification_report):
  print(report(y_test, y_pred))

0.7919799498746867
[[ 25  14  23]
 [  8  49  22]
 [  3  13 242]]
              precision    recall  f1-score   support

           0       0.69      0.40      0.51        62
           1       0.64      0.62      0.63        79
           2       0.84      0.94      0.89       258

    accuracy                           0.79       399
   macro avg       0.73      0.65      0.68       399
weighted avg       0.78      0.79      0.78       399



In [None]:
# Test rows predicted as class 2 whose true class is 0.
# Each comparison needs its own parentheses: `&` binds tighter than `==`, so
# `y_pred == 2 & (y_test == 0)` is evaluated as `y_pred == (2 & (y_test == 0))`.
# y_test is converted to a plain array so the mask is positional, like X_test.
pd.DataFrame(X_test[(y_pred == 2) & (np.asarray(y_test) == 0)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,573,574,575,576,577,578,579,580,581,582
0,6.583047285611057e+18,-8256368250469581.0,-8256368250469578.0,-8256368250469587.0,-8256368250469587.0,-8256368250469578.0,-8256368250469123.0,-8256368250469587.0,-8256368250469123.0,-8256368250469587.0,...,0.0,0.0,0.0,0.0,0.0,True,False,False,0.0,4
1,6.583047285611057e+18,-8256368250469582.0,-8256368250469576.0,-8256368250469587.0,-8256368250469587.0,-8256368250469578.0,-8256368250469123.0,-8256368250469587.0,-8256368250469123.0,-8256368250469587.0,...,0.0,0.0,0.0,0.0,0.0,True,False,False,0.0,4
2,3.503606332968127e+18,-8256368250469574.0,-8256368250469581.0,-8256368250469587.0,-8256368250469587.0,-8256368250469587.0,-8256368250469618.0,-8256368250469587.0,-8256368250469414.0,-8256368250469587.0,...,0.0,0.0,0.0,1.0,0.0,True,False,False,0.0,3
3,3.317339110860418e+18,-8256368250469571.0,-8256368250469578.0,-8256368250469587.0,-8256368250469587.0,-8256368250469584.0,-8256368250469070.0,-8256368250469587.0,-8256368250468888.0,-8256368250469587.0,...,0.0,0.0,0.0,0.0,0.0,True,False,False,0.0,4
4,2.3973525565161897e+18,-8256368250469576.0,-8256368250469581.0,-8256368250469587.0,-8256368250469587.0,-8256368250469587.0,-8256368250469490.0,-8256368250469587.0,-8256368250469139.0,-8256368250469582.0,...,0.0,0.0,0.0,0.0,0.0,True,False,False,0.0,3
5,2.041542591930478e+18,-8256368250469586.0,-8256368250469581.0,-8256368250469587.0,-8256368250469474.0,-8256368250469581.0,-8256368250469061.0,-8256368250469565.0,-8256368250469061.0,-8256368250469587.0,...,0.0,0.0,0.0,1.0,0.0,True,False,False,0.0,3
6,3.4362177362570834e+18,-8256368250469574.0,-8256368250469578.0,-8256368250469587.0,-8256368250469587.0,-8256368250469582.0,-8256368250469192.0,-8256368250469586.0,-8256368250469192.0,-8256368250469587.0,...,0.0,0.0,0.0,0.0,0.0,True,True,False,0.0,4
7,3.317339110860418e+18,-8256368250469584.0,-8256368250469579.0,-8256368250469587.0,-8256368250469587.0,-8256368250469584.0,-8256368250469070.0,-8256368250469587.0,-8256368250468826.0,-8256368250469586.0,...,0.0,0.0,1.0,1.0,0.0,True,False,False,0.0,3
8,2.041542591930478e+18,-8256368250469571.0,-8256368250469581.0,-8256368250469587.0,-8256368250469334.0,-8256368250469581.0,-8256368250469059.0,-8256368250469586.0,-8256368250469059.0,-8256368250469587.0,...,0.0,0.0,0.0,1.0,0.0,True,False,False,0.0,3
9,1.2827248649409595e+18,-8256368250469581.0,-8256368250469581.0,-8256368250469587.0,-8256368250469587.0,-8256368250469587.0,-8256368250469422.0,-8256368250469586.0,-8256368250469134.0,-8256368250469582.0,...,0.0,0.0,2.0,0.0,0.0,True,False,False,0.0,3
