In [None]:
#!/usr/bin/env python3
# Written by: Hans Müller Paul
#                           NOTES:

**Determine starting time**

In [None]:
from datetime import datetime

t0 = datetime.now()

**Importing input file**

In [3]:
from pandas import read_csv     #pip install pandas if this library isn't already installed

with open('homework.csv', 'r') as f:
    tweet_df = read_csv(f, header = 0, low_memory=False) #the low_memory flag is required when loading large datasets
    del tweet_df['account_category']

**Generating temporary dataset for faster testing:** comment out to run on entire dataset

In [4]:
# temp_data = tweet_df.head(n=100000)

**Vectorizing word count**

In [5]:
from sklearn.feature_extraction.text import CountVectorizer #pip install sklearn if this library isn't already installed

data_vectorizer = CountVectorizer(stop_words='english')
tweet_data = data_vectorizer.fit_transform(tweet_df.content) #I believe this step is the equivalent to StringToWordVector on Weka
# tweet_data = data_vectorizer.fit_transform(temp_data.content)

**Training model #1:** Multinomial Naïve Bayes, 4-fold cross-validation

In [6]:
from sklearn.model_selection import StratifiedKFold 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, cohen_kappa_score
from pandas import DataFrame

X = tweet_data
y = tweet_df['troll']
# y = temp_data['troll']
skf = StratifiedKFold(n_splits=4)
model = MultinomialNB()
metrics_list = []
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    accuracy = model.score(X_test, y_test)
    precision = precision_score(y_test, model.predict(X_test))
    recall = recall_score(y_test, model.predict(X_test))
    kappa = cohen_kappa_score(y_test, model.predict(X_test))
    metrics_list.append([accuracy,precision,recall,kappa])
metrics_1 = DataFrame(metrics_list, columns=['accuracy','precision','recall','kappa_statistic'])


**Reviewing metrics #1:**  Multinomial Naïve Bayes, 4-fold cross-validation

In [7]:
print(f'''
Average metrics:
{metrics_1.mean(axis=0)}
''')


Average metrics:
accuracy           0.801280
precision          0.711088
recall             0.962854
kappa_statistic    0.611886
dtype: float64



**Training model #2:** Multinomial Naïve Bayes, 10-fold cross-validation

In [8]:
from sklearn.model_selection import StratifiedKFold 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, cohen_kappa_score
from pandas import DataFrame

X = tweet_data
y = tweet_df['troll']
# y = temp_data['troll']
skf = StratifiedKFold(n_splits=10)
model = MultinomialNB()
metrics_list = []
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    accuracy = model.score(X_test, y_test)
    precision = precision_score(y_test, model.predict(X_test))
    recall = recall_score(y_test, model.predict(X_test))
    kappa = cohen_kappa_score(y_test, model.predict(X_test))
    metrics_list.append([accuracy,precision,recall,kappa])
metrics_2 = DataFrame(metrics_list, columns=['accuracy','precision','recall','kappa_statistic'])


**Reviewing metrics #2:**  Multinomial Naïve Bayes, 10-fold cross-validation

In [9]:
print(f'''
Average metrics:
{metrics_2.mean(axis=0)}
''')


Average metrics:
accuracy           0.825691
precision          0.757830
recall             0.964288
kappa_statistic    0.661373
dtype: float64



**Training model #3:** Multinomial Naïve Bayes with TF/IDF, 4-fold cross-validation

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import StratifiedKFold 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, cohen_kappa_score
from pandas import DataFrame

tfidf_transformer = TfidfTransformer(use_idf=True).fit(tweet_data)
tfidf_tweets = tfidf_transformer.transform(tweet_data) #this step performs a transformation from word count to TF/IDF

X = tfidf_tweets
y = tweet_df['troll']
# y = temp_data['troll']
skf = StratifiedKFold(n_splits=4)
model = MultinomialNB()
metrics_list = []
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    accuracy = model.score(X_test, y_test)
    precision = precision_score(y_test, model.predict(X_test))
    recall = recall_score(y_test, model.predict(X_test))
    kappa = cohen_kappa_score(y_test, model.predict(X_test))
    metrics_list.append([accuracy,precision,recall,kappa])
metrics_3 = DataFrame(metrics_list, columns=['accuracy','precision','recall','kappa_statistic'])


**Reviewing metrics #3:**  Multinomial Naïve Bayes with TF/IDF, 4-fold cross-validation

In [11]:
print(f'''
Average metrics:
{metrics_3.mean(axis=0)}
''')


Average metrics:
accuracy           0.822511
precision          0.741541
recall             0.942598
kappa_statistic    0.650516
dtype: float64



**Training model #4:** Stochastic Gradient Descent Classifier, 4-fold cross-validation

In [12]:
from sklearn.model_selection import StratifiedKFold 
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score, cohen_kappa_score
from pandas import DataFrame

X = tweet_data
y = tweet_df['troll']
# y = temp_data['troll']
skf = StratifiedKFold(n_splits=4)
model = SGDClassifier(max_iter = 1000)
metrics_list = []
for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model.fit(X_train,y_train)
    accuracy = model.score(X_test, y_test)
    precision = precision_score(y_test, model.predict(X_test))
    recall = recall_score(y_test, model.predict(X_test))
    kappa = cohen_kappa_score(y_test, model.predict(X_test))
    metrics_list.append([accuracy,precision,recall,kappa])
metrics_4 = DataFrame(metrics_list, columns=['accuracy','precision','recall','kappa_statistic'])


**Reviewing metrics #4:**  Stochastic Gradient Descent Classifier, 4-fold cross-validation

In [13]:
print(f'''
Average metrics:
{metrics_4.mean(axis=0)}
''')


Average metrics:
accuracy           0.841417
precision          0.818180
recall             0.846675
kappa_statistic    0.681604
dtype: float64



**Determine duration of run**

In [14]:
t1 = datetime.now()
print(f'Total duration: {(t1-t0)}')

Total duration: 0:01:13.412365
