In [None]:
import sklearn 
import random
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, make_scorer, confusion_matrix, accuracy_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the data set from data_final.csv (path is relative to the working directory).
df = pd.read_csv("data_final.csv")
# Preview the first rows as the cell output.
df.head()
# print(df.columns)

In [None]:
# Drop the leftover 'Unnamed: 0' column.
# NOTE(review): presumably this is the index column written by an earlier to_csv — confirm.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

In [None]:
# Oversample the positive class: keep every original row and append 20 extra
# copies of each row with ipo == 1.
# Gotcha: the *list* of frames is what gets repeated here. Multiplying the DataFrame
# itself by 20 would scale its values (ipo would become 20) instead of duplicating rows.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# NOTE(review): the duplicates are created before the train/test splits further down,
# so copies of the same row can land in both train and test — confirm this is intended.
df_w_dup = pd.concat([df] + [df[df['ipo'] == 1]] * 20, ignore_index=True)

In [None]:
# Sum of the 'ipo' column after oversampling (cell output).
df_w_dup['ipo'].sum()

In [None]:
# Feature columns = every column except the identifiers ('name', 'funded_object_id')
# and the target ('ipo').
features = df_w_dup.columns.drop(['name', 'ipo', 'funded_object_id'])

In [None]:
#scaling on features
# Standardize every feature column with StandardScaler and write the scaled values
# back into df_w_dup in place (the raw feature values are overwritten).
# NOTE(review): the scaler is fit on the full frame before any train/test split, so
# held-out rows contribute to the scaling statistics — confirm this is acceptable.
df_w_dup[features] = StandardScaler().fit_transform(df_w_dup[features])
# Show the scaled feature columns as the cell output.
df_w_dup[features]

In [None]:
# computing weighted accuracy from CS4780
def weighted_accuracy(pred, true):
    """Class-balanced accuracy for binary 0/1 labels.

    Each class is weighted by the inverse of its frequency in ``true``, which
    makes the score equal to the mean of the per-class recalls
    (true-positive rate and true-negative rate).

    Args:
        pred: sequence of predicted labels.
        true: sequence of ground-truth labels (0 or 1), same length as ``pred``.

    Returns:
        float in [0, 1]. When only one class occurs in ``true`` the score is the
        recall on that class (instead of dividing by zero).

    Raises:
        AssertionError: if ``pred`` and ``true`` differ in length.
        ValueError: if ``true`` is empty or contains a label other than 0 or 1.
    """
    assert len(pred) == len(true), "pred and true must have the same length"

    num_pos = num_neg = 0
    num_pos_correct = num_neg_correct = 0
    for pred_i, true_i in zip(pred, true):
        if true_i == 1:
            num_pos += 1
            num_pos_correct += int(pred_i == true_i)
        elif true_i == 0:
            num_neg += 1
            num_neg_correct += int(pred_i == true_i)
        else:
            # A non-binary label would silently corrupt the class counts.
            raise ValueError("weighted_accuracy expects 0/1 labels, got %r" % (true_i,))

    # Recall of every class that is actually present; an absent class is skipped
    # so a single-class input does not trigger a division by zero.
    recalls = []
    if num_pos:
        recalls.append(num_pos_correct / num_pos)
    if num_neg:
        recalls.append(num_neg_correct / num_neg)
    if not recalls:
        raise ValueError("weighted_accuracy needs at least one label")

    return sum(recalls) / len(recalls)

# Adapter + scorer so scikit-learn model selection can optimise weighted accuracy.
def weighted_accuracy_switched(y, y_pred):
    """Call weighted_accuracy with the (true, predicted) argument order used by scorers.

    make_scorer invokes its score function as ``f(y_true, y_pred)``, whereas
    weighted_accuracy takes ``(pred, true)``; this wrapper swaps the two.
    """
    return weighted_accuracy(pred=y_pred, true=y)

# Higher weighted accuracy is better, so the scorer keeps the sign of the metric.
weighted_accuracy_score = make_scorer(weighted_accuracy_switched, greater_is_better=True)

In [None]:
# Design matrix (scaled feature columns) and target vector (the 'ipo' column).
X, Y = df_w_dup[features], df_w_dup['ipo']

In [None]:
#first try on knn
# Hold out 30% of the rows for the final report. The split is seeded so the scores
# are reproducible after a kernel restart; 0 matches the random_state already used
# for the random forest in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

# Hyper-parameter grid explored by the grid search below.
tuned_parameters = [{'n_neighbors':[1, 3, 5, 7, 9, 11, 15, 21, 35, 51], 'weights': ['uniform', 'distance']}]

# Baseline: 5-fold cross-validation of a fixed 3-NN model on the whole data set,
# scored with the custom weighted-accuracy scorer.
knn_cv = KNeighborsClassifier(n_neighbors = 3)
knn_cv_scores = cross_val_score(knn_cv, X, Y, cv=5, scoring=weighted_accuracy_score)
# knn_cv_scores
print("Accuracy: %0.2f (+/- %0.2f)" % (knn_cv_scores.mean(), knn_cv_scores.std() * 2))

print("# Tuning KNN hyper-parameters for weighted accuracy")
print()

# Cross-validated grid search on the training split only.
knn_clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, scoring=weighted_accuracy_score)
knn_clf.fit(X_train, y_train)

print("Best parameters set found on train set:")
print()
print(knn_clf.best_params_)
print()
print("Grid scores on train set:")
print()
# One line per parameter combination: mean CV score and +/- two standard deviations.
means = knn_clf.cv_results_['mean_test_score']
stds = knn_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, knn_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full train set.")
print("The scores are computed on the full test set.")
print()
# predict() on the fitted search uses the estimator refit with the best parameters.
y_true, y_pred = y_test, knn_clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

In [None]:
#svm cross validation {'C': 10000.0, 'gamma': 0.05, 'kernel': 'rbf'}
# NOTE(review): the dict above looks like the best parameters from an earlier run
# with a different grid (C=1e4 is not in the grid below) — confirm or remove.
# Seeded 70/30 split so the results are reproducible; 0 matches the random_state
# already used for the random forest in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

# Hyper-parameter grid explored by the grid search below.
tuned_parameters = [{'kernel': ['poly', 'rbf', 'linear', 'sigmoid'], 'gamma': [0.1,0.05,0.01], 'C': [1e-2, 1e-1, 1, 10, 100]}]

print("# Tuning SVM hyper-parameters for weighted accuracy")
print()

# `scoring` must be passed by keyword: everything after param_grid is keyword-only
# in current scikit-learn, so passing the scorer positionally raises a TypeError.
svm_clf = GridSearchCV(svm.SVC(), tuned_parameters, scoring=weighted_accuracy_score)
svm_clf.fit(X_train, y_train)

print("Best parameters set found on train set:")
print()
print(svm_clf.best_params_)
print()
print("Grid scores on train set:")
print()
# One line per parameter combination: mean CV score and +/- two standard deviations.
means = svm_clf.cv_results_['mean_test_score']
stds = svm_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, svm_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full train set.")
print("The scores are computed on the full test set.")
print()
# predict() on the fitted search uses the estimator refit with the best parameters.
y_true, y_pred = y_test, svm_clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.5)

# regressor = RandomForestClassifier(n_estimators=20, random_state=0)
# regressor.fit(X_train, y_train)
# y_pred = regressor.predict(X_test)

# from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# rfc_cv_score = cross_val_score(regressor, X, Y, cv=10, scoring=weighted_accuracy_score)

# print("=== Confusion Matrix ===")
# print(confusion_matrix(y_test, y_pred))
# print('\n')
# print("=== Classification Report ===")
# print(classification_report(y_test, y_pred))
# print('\n')
# print("=== All AUC Scores ===")
# print(rfc_cv_score)
# print('\n')
# print("=== Mean AUC Score ===")
# print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())


In [None]:
#third try on random forest
# Seeded 70/30 split so the results are reproducible; 0 matches the random_state
# used for the final random forest model in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

# Hyper-parameter grid: only the number of trees is tuned.
tuned_parameters = [{'n_estimators': [2, 5, 10, 20]}]

print("# Tuning Random Forest hyper-parameters for weighted accuracy")
print()

# The forest itself is randomised (bootstrap samples, feature sub-sampling), so it
# is seeded too; otherwise the grid scores change from run to run.
rf_clf = GridSearchCV(RandomForestClassifier(random_state=0), tuned_parameters, scoring=weighted_accuracy_score)
rf_clf.fit(X_train, y_train)

print("Best parameters set found on train set:")
print()
print(rf_clf.best_params_)
print()
print("Grid scores on train set:")
print()
# One line per parameter combination: mean CV score and +/- two standard deviations.
means = rf_clf.cv_results_['mean_test_score']
stds = rf_clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full train set.")
print("The scores are computed on the full test set.")
print()
# predict() on the fitted search uses the estimator refit with the best parameters.
y_true, y_pred = y_test, rf_clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()


In [None]:
#fourth try on logistic regression with l1
# Seeded 70/30 split so the results are reproducible; 0 matches the random_state
# already used for the random forest in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

# Grid over the inverse regularisation strength C (1e-3 ... 1e3) with an L1 penalty.
tuned_parameters = {'C' : np.logspace(-3,3,7), 'penalty':['l1'], 'solver':['saga']}
# penalty='l1', solver='liblinear'

print("# Tuning Logistic Regression hyper-parameters for weighted accuracy")
print()

# saga shuffles the data while optimising, so the estimator is seeded as well.
# NOTE(review): if ConvergenceWarnings appear, max_iter probably needs raising — confirm.
logreg_cv = GridSearchCV(LogisticRegression(random_state=0),tuned_parameters, scoring = weighted_accuracy_score)
logreg_cv.fit(X_train,y_train)

print("Best parameters set found on train set:")
print()
print(logreg_cv.best_params_)
print()
print("Grid scores on train set:")
print()
# One line per parameter combination: mean CV score and +/- two standard deviations.
means = logreg_cv.cv_results_['mean_test_score']
stds = logreg_cv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, logreg_cv.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full train set.")
print("The scores are computed on the full test set.")
print()
# predict() on the fitted search uses the estimator refit with the best parameters.
y_true, y_pred = y_test, logreg_cv.predict(X_test)
print(classification_report(y_true, y_pred))
print()


In [None]:
# Train/validation split shared by the final models below. It is seeded so the
# reported weighted accuracies are reproducible across kernel restarts.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [None]:
# Final KNN model: reuse the best hyper-parameters found by the KNN grid search
# (knn_clf) instead of a hand-copied value, then score it on the validation split.
# NOTE(review): knn_clf was tuned on a split drawn separately from X_train/X_val,
# so validation rows may have been seen during tuning — confirm this is acceptable.
knn_model = KNeighborsClassifier(**knn_clf.best_params_)
knn_model.fit(X_train, Y_train)
knn_pred = knn_model.predict(X_val)

print("The weighted accuracy for knn is:", weighted_accuracy(knn_pred, Y_val))

In [None]:
# Final SVM model: take C, gamma and kernel from the best configuration found by
# the SVM grid search (svm_clf), then score it on the validation split.
# NOTE(review): svm_clf was tuned on a split drawn separately from X_train/X_val,
# so validation rows may have been seen during tuning — confirm this is acceptable.
svm_best = svm_clf.best_params_
svm_model = svm.SVC(C=svm_best['C'], gamma=svm_best['gamma'], kernel=svm_best['kernel'])
svm_model.fit(X_train, Y_train)
svm_pred = svm_model.predict(X_val)

print("The weighted accuracy for svm is:", weighted_accuracy(svm_pred, Y_val))

In [None]:
# Final random forest: take n_estimators from the best configuration found by the
# random forest grid search (rf_clf), then score it on the validation split.
# NOTE(review): rf_clf was tuned on a split drawn separately from X_train/X_val,
# so validation rows may have been seen during tuning — confirm this is acceptable.
rf_model = RandomForestClassifier(n_estimators=rf_clf.best_params_['n_estimators'], random_state=0)
rf_model.fit(X_train, Y_train)
rf_pred = rf_model.predict(X_val)

print("The weighted accuracy for random forest is:", weighted_accuracy(rf_pred, Y_val))

In [None]:
# Final L1 logistic regression: take C from the best configuration found by the
# logistic regression grid search (logreg_cv), then score it on the validation split.
# NOTE(review): logreg_cv was tuned on a split drawn separately from X_train/X_val,
# so validation rows may have been seen during tuning — confirm this is acceptable.
logit_model = LogisticRegression(penalty = 'l1', solver = 'saga', C = logreg_cv.best_params_['C'])
logit_model.fit(X_train, Y_train)
logit_pred = logit_model.predict(X_val)

print("The weighted accuracy for logit is:", weighted_accuracy(logit_pred, Y_val))