In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
seed=1
models = [
#             'ADB',
            'GBC',
            'RFC',
            'KNC',
            'SVC',
            'logisticRegression'
         ]
clfs = [
#         AdaBoostClassifier(random_state=seed),
        GradientBoostingClassifier(random_state=seed),
        RandomForestClassifier(random_state=seed,n_jobs=-1),
        KNeighborsClassifier(n_jobs=-1),
        SVC(random_state=seed,probability=True),
        LogisticRegression(solver='newton-cg', multi_class='multinomial')
        ]
params = {
#             models[0]:{'learning_rate':[0.01], 'n_estimators':[150]},
            models[0]:{'learning_rate':[0.01],'n_estimators':[100], 'max_depth':[3],
                       'min_samples_split':[2],'min_samples_leaf': [2]},
            models[1]:{'n_estimators':[100], 'criterion':['gini'],'min_samples_split':[2],
                      'min_samples_leaf': [4]},
            models[2]:{'n_neighbors':[5], 'weights':['distance'],'leaf_size':[15]},
            models[3]: {'C':[100], 'tol': [0.005],
                       'kernel':['sigmoid']},
            models[4]: {'C':[2000], 'tol': [0.0001]}
         }

In [16]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [3]:
y_test = 0
test_scores = []

In [4]:
df = pd.read_csv('/home/ifte-home/Documents/mental_health/suicide/suicide_stories_to_help.csv')

In [15]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,main_text,is_suicide,processed_text
0,0,0,need help hi really know phrase situation try ...,0,need help hi realli know phrase situat tri lif...
1,1,1,feeling overwhelmed hopeless depressed past co...,1,feel overwhelm hopeless depress past coupl wee...
2,2,2,nothing matter anymore getting worse hi know e...,0,noth matter anymor get wors hi know els go dev...
3,3,3,tired hearing bullshit shit like get better ev...,1,tire hear bullshit shit like get better everyo...
4,4,4,wish wa someone else wish wa prettier wish fee...,0,wish wa someon els wish wa prettier wish feel ...


In [5]:
# Dropping rows where NaN is present
df = df.dropna(subset=['processed_text'])

In [6]:
pipeline = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])

vectorized_data = pipeline.fit_transform(df['processed_text'])

n_x_train, n_x_val, n_y_train, n_y_val = train_test_split(vectorized_data.toarray(), 
                                                    df.is_suicide, test_size=0.2, 
                                                    random_state=0)

In [26]:
for name, clf in zip(names,classifiers):
    print(name)
    clf.fit(n_x_train, n_y_train)
    score = clf.score(n_x_val, n_y_val)

#     print("best params: " + str(clf.best_params_))
#     print("best scores: " + str(clf.best_score_))
#     estimates = clf.predict_proba(n_x_val)
    acc = accuracy_score(n_y_val, clf.predict(n_x_val))
    print("Accuracy: {:.4%}".format(acc))

Nearest Neighbors
Accuracy: 62.9333%
Linear SVM
Accuracy: 51.2000%
RBF SVM
Accuracy: 70.9333%
Gaussian Process
Accuracy: 70.4000%
Decision Tree
Accuracy: 62.4000%
Random Forest
Accuracy: 50.4000%
Neural Net
Accuracy: 70.4000%
AdaBoost
Accuracy: 66.9333%
Naive Bayes
Accuracy: 53.8667%
QDA




Accuracy: 48.5333%


In [31]:
for name, estimator in zip(models,clfs):
    print(name)
    clf = GridSearchCV(estimator, params[name], refit='True', n_jobs=-1, cv=5)
    clf.fit(n_x_train, n_y_train)

    print("best params: " + str(clf.best_params_))
    print("best scores: " + str(clf.best_score_))
    estimates = clf.predict_proba(n_x_val)
    acc = accuracy_score(n_y_val, clf.predict(n_x_val))
    print("Accuracy: {:.4%}".format(acc))
    
    test_scores.append((acc,clf.best_score_))
    
#     submission = pd.DataFrame(estimates, index=test_ids, columns=le.classes_)
#     submission.to_csv('./'+name+'.csv')

GBC
best params: {'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
best scores: 0.6877614269788184
Accuracy: 64.2667%
RFC
best params: {'criterion': 'gini', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
best scores: 0.6771014492753624
Accuracy: 69.6000%
KNC
best params: {'leaf_size': 15, 'n_neighbors': 5, 'weights': 'distance'}
best scores: 0.6204102564102565
Accuracy: 61.0667%
SVC
best params: {'C': 100, 'kernel': 'sigmoid', 'tol': 0.005}
best scores: 0.6037279821627648
Accuracy: 66.1333%
logisticRegression
best params: {'C': 2000, 'tol': 0.0001}
best scores: 0.6550903010033444
Accuracy: 68.5333%


In [56]:
param_grid = {'kernel': ['rbf'] , 
              'gamma': [0.2, 0.4, 0.7, 2], 
              'degree' : [2]}

clf = GridSearchCV(SVC() , param_grid , refit=True , verbose=3)
clf.fit(n_x_train, n_y_train)

print("best params: " + str(clf.best_params_))
print("best scores: " + str(clf.best_score_))
estimates = clf.predict_proba(n_x_val)
acc = accuracy_score(n_y_val, clf.predict(n_x_val))
print('Predict_prob', estimates , "Accuracy: {:.4%}".format(acc))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ...degree=2, gamma=0.2, kernel=rbf;, score=0.663 total time=   7.3s
[CV 2/5] END ...degree=2, gamma=0.2, kernel=rbf;, score=0.733 total time=   7.4s
[CV 3/5] END ...degree=2, gamma=0.2, kernel=rbf;, score=0.727 total time=   7.5s
[CV 4/5] END ...degree=2, gamma=0.2, kernel=rbf;, score=0.677 total time=   7.3s
[CV 5/5] END ...degree=2, gamma=0.2, kernel=rbf;, score=0.689 total time=   7.3s
[CV 1/5] END ...degree=2, gamma=0.4, kernel=rbf;, score=0.670 total time=   7.2s
[CV 2/5] END ...degree=2, gamma=0.4, kernel=rbf;, score=0.727 total time=   7.3s
[CV 3/5] END ...degree=2, gamma=0.4, kernel=rbf;, score=0.743 total time=   7.3s
[CV 4/5] END ...degree=2, gamma=0.4, kernel=rbf;, score=0.680 total time=   7.2s
[CV 5/5] END ...degree=2, gamma=0.4, kernel=rbf;, score=0.696 total time=   7.2s
[CV 1/5] END ...degree=2, gamma=0.7, kernel=rbf;, score=0.663 total time=   7.7s
[CV 2/5] END ...degree=2, gamma=0.7, kernel=rbf;,

AttributeError: predict_proba is not available when  probability=False

In [13]:
# estimates = clf.predict_proba(n_x_val)
acc = accuracy_score(n_y_val, clf.predict(n_x_val))
print("Accuracy: {:.4%}".format(acc))

Accuracy: 69.6000%


In [19]:
print(clf.best_estimator_)
grid_predictions = clf.predict(n_x_val)
print(confusion_matrix(n_y_val,grid_predictions))
print(classification_report(n_y_val,grid_predictions))

SVC(C=3, degree=2, gamma=0.7)
[[122  61]
 [ 53 139]]
              precision    recall  f1-score   support

           0       0.70      0.67      0.68       183
           1       0.69      0.72      0.71       192

    accuracy                           0.70       375
   macro avg       0.70      0.70      0.70       375
weighted avg       0.70      0.70      0.70       375



In [24]:
df = pd.read_csv('/home/ifte-home/Downloads/result.csv')

In [25]:
df

Unnamed: 0,a,b,c,d,e,f
0,[CV 1/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.660,9.5s
1,[CV 2/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.703,7.0s
2,[CV 3/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.737,6.9s
3,[CV 4/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.683,6.7s
4,[CV 5/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.682,6.8s
...,...,...,...,...,...,...
331,[CV 2/5] END C=100,degree=4,gamma=0.2,kernel=rbf;,0.657,6.5s
332,[CV 3/5] END C=100,degree=4,gamma=0.2,kernel=rbf;,0.717,6.9s
333,[CV 4/5] END C=100,degree=4,gamma=0.2,kernel=rbf;,0.643,6.3s
334,[CV 5/5] END C=100,degree=4,gamma=0.2,kernel=rbf;,0.659,6.5s


In [61]:
count_df.groupby(['c'])['c'].count()

c
 gamma=0.1    14
 gamma=0.2    11
 gamma=0.4    16
 gamma=0.7    17
Name: c, dtype: int64

In [58]:
count_df = df[df.e > 0.7]

In [59]:
count_df = count_df.reset_index(drop=True)

In [60]:
count_df

Unnamed: 0,a,b,c,d,e,f
0,[CV 2/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.703,7.0s
1,[CV 3/5] END C=5,degree=2,gamma=0.1,kernel=rbf;,0.737,6.9s
2,[CV 3/5] END C=5,degree=2,gamma=0.2,kernel=rbf;,0.71,7.4s
3,[CV 3/5] END C=5,degree=2,gamma=0.4,kernel=rbf;,0.717,7.6s
4,[CV 2/5] END C=5,degree=2,gamma=0.4,kernel=poly;,0.747,6.2s
5,[CV 3/5] END C=5,degree=2,gamma=0.4,kernel=poly;,0.73,6.7s
6,[CV 3/5] END C=5,degree=2,gamma=0.7,kernel=rbf;,0.73,6.9s
7,[CV 2/5] END C=5,degree=2,gamma=0.7,kernel=poly;,0.723,6.5s
8,[CV 3/5] END C=5,degree=2,gamma=0.7,kernel=poly;,0.747,6.4s
9,[CV 2/5] END C=5,degree=3,gamma=0.1,kernel=rbf;,0.703,6.2s
