In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, CategoricalEncoder

import pandas as pd
import numpy as np

In [3]:
# Read in the two tsv data files.
# DS1: fields are split on any run of whitespace. `sep=r'\s+'` is the
# non-deprecated spelling of `delim_whitespace=True`.
df1 = pd.read_csv('06693-0001-Data.tsv', sep=r'\s+')
# DS2: tab-delimited. NOTE(review): na_filter=False switches off missing-value
# detection, so blank fields stay as strings and na_values='.' has no effect —
# confirm that keeping blanks as their own level is intended.
df2 = pd.read_csv('06693-0002-Data.tsv', delimiter='\t', na_filter=False, na_values='.')

In [6]:
# Column names of interest in the second dataset (looked up in df2 below).
DS2_suicide_factors = [
    'EDCAT', 'RACE', 'EMP',
    'DEPREC', 'ALCALT1', 'DRGALT1',
]

# Column names of interest in the first dataset (looked up in df1 below).
DS1_suicide_factors = [
    'V12', 'V13',
    'V4515', 'V5114', 'V5118', 'V5113', 'V5115', 'V5112',
    'V4428', 'V4433',
    'V6305', 'V6301',
    'V5225', 'V6649', 'V6749', 'V5918',
    'V6143', 'V6126', 'V6114',
    'V101', 'V102', 'V6215',
]

In [7]:
# Show the shape of both datasets side by side
print(*(frame.shape for frame in (df1, df2)))

(8098, 2954) (8098, 224)


In [8]:
# Frequency table for every DS2 factor column
for factor in DS2_suicide_factors:
    level_counts = df2[factor].value_counts()
    print(level_counts)

2    2679
3    2132
4    1813
1    1474
Name: EDCAT, dtype: int64
1    6084
2    1011
3     733
4     270
Name: RACE, dtype: int64
2    5979
1     840
3     719
4     560
Name: EMP, dtype: int64
     6642
4     620
1     360
2     316
3     160
Name: DEPREC, dtype: int64
0    6177
1    1921
Name: ALCALT1, dtype: int64
0    7154
1     944
Name: DRGALT1, dtype: int64


In [18]:
# Number of rows at each V4515 level
df1.groupby('V4515')['V4515'].count()

V4515
0    3244
1    1921
2     192
3     576
4      33
9    2132
Name: V4515, dtype: int64

In [19]:
# Rows per level for every DS1 factor column
for factor in DS1_suicide_factors:
    per_level = df1.groupby(factor)[factor].count()
    print(per_level)

V12
15    152
16    167
17    160
18    159
19    168
20    197
21    180
22    179
23    197
24    209
25    219
26    230
27    266
28    245
29    240
30    298
31    284
32    282
33    297
34    265
35    258
36    250
37    249
38    226
39    226
40    239
41    212
42    200
43    185
44    197
45    148
46    167
47    160
48    149
49    141
50    151
51    110
52    146
53    145
54    136
55      4
58      1
59      2
61      1
99      1
Name: V12, dtype: int64
V13
1    3835
2    4263
Name: V13, dtype: int64
V4515
0    3244
1    1921
2     192
3     576
4      33
9    2132
Name: V4515, dtype: int64
V5114
0      67
1     391
2    1548
3    2579
4    1340
8       2
9    2171
Name: V5114, dtype: int64
V5118
0      67
1     131
2     629
3    2217
4    2871
8       3
9    2180
Name: V5118, dtype: int64
V5113
0      67
1     404
2    1342
3    2828
4    1294
9    2163
Name: V5113, dtype: int64
V5115
0      67
1     249
2    1287
3    3056
4    1273
8       1
9    2165
Name: V511

In [23]:
# Index labels of the rows where V4428 equals 99
rows_with_99 = df1.query('V4428 == 99')
rows_with_99.index.tolist()

[]

In [None]:
df[var_dict['response']].value_counts()

In [None]:
for col in var_dict['cat_preds']:
    print(col + ":")
    print(df[col].value_counts())

In [None]:
# Make a train-test split:
train_set, test_set = train_test_split(df, random_state=1, stratify=df[var_dict['response']])


In [None]:
print(train_set.shape, test_set.shape)

In [None]:
# Transformer that selects attributes (columns) from a DataFrame
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that indexes a DataFrame by a fixed set of column labels.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the frame handed to ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the ``.values`` array of ``X[self.attribute_names]``."""
        selected = X[self.attribute_names]
        return selected.values

# We will need this later
# NOTE(review): MinMaxScaler is not among the imports visible in this notebook;
# import it before enabling this pipeline.
#num_pipeline = Pipeline([
#    ('selector', DataFrameSelector(num_attribs)),
#    ('power_transf', PowerTransformer(method='yeo-johnson', standardize=False)),
#    ('minmax_scaler', MinMaxScaler()),
#])

# The selector needs the categorical column list at construction time, so
# resolve it here; otherwise this cell raises NameError on a fresh kernel.
cat_attribs = var_dict['cat_preds']

# Select the categorical columns, then one-hot encode them into a dense array.
# Categories not seen during fit are ignored at transform time.
# NOTE(review): newer scikit-learn releases spell `sparse` as `sparse_output`;
# confirm against the installed version.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Union of all feature pipelines (only the categorical one is active for now).
full_pipeline = FeatureUnion(transformer_list=[
    #("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])




In [None]:
#num_attribs = var_dict['num_preds']
cat_attribs = var_dict['cat_preds']


y_train = train_set[var_dict['response']]

# Prepare the training data with the pipeline above
X_train_prepared = cat_pipeline.fit_transform(train_set)
X_train_prepared

In [None]:
X_train_prepared.shape

In [None]:
X_test_prepared = cat_pipeline.transform(test_set)

In [None]:
X_test_prepared.shape

In [None]:
# IPython magic: list the variables currently in the interactive namespace.
%whos

In [None]:
# NOTE(review): no parentheses, so this displays the bound method itself
# rather than calling get_params() — confirm which was intended.
cat_pipeline.get_params

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the one-hot encoded training data.
# The seed is fixed so that tie-breaking between equally good splits, and
# therefore the fitted tree, is repeatable across runs.
tree_clf = DecisionTreeClassifier(random_state=1)
tree_clf.fit(X_train_prepared, y_train)

In [None]:
tree_pred = tree_clf.predict(X_test_prepared)

In [None]:
from sklearn.metrics import classification_report
y_train_pred = tree_clf.predict(X_train_prepared)
print(classification_report(y_train, y_train_pred))

In [None]:
tree_clf.classes_

In [None]:
# Take the test response from the same column spec used for y_train, so the
# two cannot drift apart.
# NOTE(review): assumes var_dict['response'] refers to the column previously
# hard-coded here as 'V1144' — confirm.
y_test = test_set[var_dict['response']]

In [None]:
y_test_pred = tree_clf.predict(X_test_prepared)

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the SAGA solver.
# SAGA shuffles the data, so the seed is pinned to keep the fitted
# coefficients and all downstream scores repeatable.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=1000, multi_class='auto',
                                n_jobs=-1, verbose=1, random_state=1)
classifier

In [None]:
classifier.fit(X_train_prepared, y_train)

In [None]:
print(f"Training Data Score: {classifier.score(X_train_prepared, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_prepared, y_test)}")

In [None]:
y_test_pred = classifier.predict(X_test_prepared)

In [None]:
print(classification_report(y_test, y_test_pred))

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
import seaborn as sns

In [None]:
conf_mx = confusion_matrix(y_test, y_test_pred)

In [None]:
classifier.classes_

In [None]:
conf_df= pd.DataFrame(conf_mx, index=classifier.classes_, columns=classifier.classes_)

In [None]:
import matplotlib.pyplot as plt

# Annotated heatmap of the confusion matrix. fmt="d" prints the integer
# counts in full instead of the default rounded general format.
sns.heatmap(conf_df, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
# Lay out only after the title and axis labels exist so they are taken into
# account when the margins are computed.
plt.tight_layout()
plt.show()

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
dummy = DummyClassifier(strategy='most_frequent', random_state=1)

In [None]:
dummy.fit(X_train_prepared, y_train)

In [None]:
dummy.score(X_test_prepared, y_test)

In [None]:
dummy.classes_

In [None]:
# IPython help lookup (interactive aid; not valid plain Python).
DummyClassifier?

In [None]:
dummy2 = DummyClassifier(strategy='constant', random_state=1, constant=5)

In [None]:
dummy2.fit(X_train_prepared, y_train)

In [None]:
dummy.score(X_test_prepared, y_test)   # same as 'most_frequent'

In [None]:
# IPython help lookup (interactive aid; not valid plain Python).
LogisticRegression?

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validated accuracy of the logistic-regression model on the training data
cross_val_score(
    classifier,
    X_train_prepared,
    y_train,
    scoring='accuracy',
    cv=3,
    verbose=True,
    n_jobs=-1,
)

In [None]:
from sklearn.base import BaseEstimator

In [None]:
import numpy as np

In [None]:
class Always0Classifier(BaseEstimator):
    """Baseline estimator that ignores its input and predicts 0 for every row."""

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` as the scikit-learn estimator API expects."""
        return self

    def predict(self, X):
        """Return a ``(len(X), 1)`` array of zeros, one row per sample in ``X``."""
        return np.zeros((len(X), 1))
    

In [None]:
always_0_classifer = Always0Classifier()

In [None]:
cross_val_score(always_0_classifer, X_train_prepared, y_train, cv=3, scoring='accuracy', n_jobs=-1)

In [None]:
# Frequency of each response level in the training split.
y_train.value_counts()

In [None]:
# Frequency of each response level in the test split.
y_test.value_counts()

In [None]:
# NOTE(review): 2633 is a hard-coded count, presumably read off one of the
# value_counts outputs above — confirm which class it refers to.
2633/len(y_train)

In [None]:
# NOTE(review): 284 is likewise hard-coded; verify whether it belongs to the
# training split or should be divided by len(y_test) instead.
284/len(y_train)

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
y_cv_train_pred = cross_val_predict(classifier, X_train_prepared, y_train, cv=5, n_jobs=-1)

In [None]:
confusion_matrix(y_train, y_cv_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
precision_score(y_train, y_cv_train_pred, average='weighted')

In [None]:
recall_score(y_train, y_cv_train_pred, average='weighted')

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {'C': [0.01, 0.1, 1, 5, 10, 50]}

In [None]:
grid = GridSearchCV(classifier, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train_prepared, y_train)

In [None]:
print(grid.best_score_)

In [None]:
print(grid.best_params_)

In [None]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]}

In [None]:
grid = GridSearchCV(classifier, param_grid, verbose=3, n_jobs=-1)

In [None]:
grid.fit(X_train_prepared, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_score_