In this notebook I search for the best classifier and its parameters for tweets multi-class classifications based on authorship attributes.

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
import os

In [2]:
# Set the path to the folder containing CSV files
folder_path = 'tweets/'
os.listdir(folder_path)
            
            

['.ipynb_checkpoints',
 '10460KDNuggetsTweets.csv',
 'AdamSavageTweets.csv',
 'AllTweets.csv',
 'BarackObama.csv',
 'DonaldTrump2014-01-01To2016-10-14Tweets.csv',
 'DonaldTrumpTweets.csv',
 'FiveThirtyEightTweets.csv',
 'HillaryClinton2014-01-01To2016-10-14Tweets.csv',
 'HillaryClintonTweets.csv',
 'KimKardashianTweets.csv',
 'NeildeGrasseTysonTweets.csv',
 'RichardDawkins.csv',
 'ScottKelly.csv']

In [3]:
# Get a list of all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Create an empty list to store DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for csv_file in csv_files:
    csv_file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(csv_file_path)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Print the combined DataFrame
combined_df

Unnamed: 0.2,Unnamed: 0,date,id,link,retweet,text,author,Unnamed: 0.1
0,0,Oct 4,783396985093193728,/missyscheng/status/783396985093193728,False,#DataScience Basics: #DataMining vs. #Statisti...,various,
1,1,Oct 4,783381842024103936,/EXASOLAG/status/783381842024103936,False,How to Become a #Data Scientist – Part 1: http...,various,
2,2,Oct 4,783433625723252736,/TarasNovak/status/783433625723252736,False,@jesterxl @kdnuggets or just go with @tableau :),various,
3,3,Oct 4,783428740453982208,/kdnuggets/status/783428740453982208,False,#Boston U. Online MS in Applied #Business #Ana...,various,
4,4,1h1 hour ago,787052623291641856,/kdnuggets/status/787052623291641856,False,#ICYMI Still Searching for ROI in #BigData Ana...,various,
...,...,...,...,...,...,...,...,...
173030,1214,24 Aug 2009,3506949420,/StationCDRKelly/status/3506949420,False,@karen4jazz thanks!,ScottKelly,
173031,1215,23 Aug 2009,3505850138,/StationCDRKelly/status/3505850138,False,The HARDEST thing about this ISS training is h...,ScottKelly,
173032,1216,23 Aug 2009,3500803828,/StationCDRKelly/status/3500803828,False,Eating breakfast at the Okura Frontier Hotel i...,ScottKelly,
173033,1217,23 Aug 2009,3488056654,/StationCDRKelly/status/3488056654,False,I think you will find the comparison (and cont...,ScottKelly,


In [4]:
df = combined_df
df = df.dropna()
print(df.isna().sum())

Unnamed: 0      0
date            0
id              0
link            0
retweet         0
text            0
author          0
Unnamed: 0.1    0
dtype: int64


In [5]:
pd.DataFrame(df.groupby('author').size().rename('counts')).sort_values('counts', ascending=False)

Unnamed: 0_level_0,counts
author,Unnamed: 1_level_1
DonaldTrump,17216
NASA,15910
KimKardashian,10688
various,10440
FiveThirtyEight,9761
BarackObama,6896
RichardDawkins,5839
AdamSavage,4872
HillaryClinton,3356
deGrasseTyson,2428


In [6]:
import random
from sklearn.model_selection import train_test_split
#1000 random sample rows for each author
df_new=pd.DataFrame()
twts_train=pd.DataFrame()
twts_test=pd.DataFrame()
author_train=pd.DataFrame()
author_test=pd.DataFrame()
for a in df.author.unique():
    rows = random.sample(list(df[df['author']==a].index), 1000)
    df_temp = df.loc[rows]
    # df_new=df_new.append(df_temp,ignore_index=True)   
    df_new = pd.concat([df_new, df_temp], ignore_index=True)
    X_train, X_test, Y_train, Y_test = train_test_split(df_temp.loc[:,['text']], df_temp.loc[:,['author']], test_size=0.2, random_state=42)
    twts_train=twts_train.append(X_train, verify_integrity=False)
    twts_test=twts_test.append(X_test, verify_integrity=False)
    author_train=author_train.append(Y_train, verify_integrity=False)
    author_test=author_test.append(Y_test, verify_integrity=False)

Train set:

In [7]:
print (len(twts_train),len(author_train))

8800 8800


Test set:

In [8]:
print(len(twts_test),len(author_test))

2200 2200


In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
def text_process(text):
    """
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    3. Stems
    4. Returns a list of the cleaned text
    """

    # tokenizing
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    
    # steming
    porter_stemmer = PorterStemmer()
    
    text_processed = [porter_stemmer.stem(word) for word in text_processed]
    

    return text_processed

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

In [11]:
ScoreSummaryByModel = list()

In [12]:
def PredictionEvaluation(author_test_b,author_predicted_b,target_names,comment):
    Accuracy=accuracy_score(author_test_b,author_predicted_b)
    #print (Accuracy)
    Recall=recall_score(author_test_b, author_predicted_b, labels=[0,1,2,3], average='macro')
    #print (Recall)
    Precision=precision_score(author_test_b, author_predicted_b, labels=[0,1,2,3], average='macro')
    #print (Precision)
    F1=f1_score(author_test_b, author_predicted_b, labels=[0,1,2,3], average='macro')
    #print (F1)
    ScoreSummaryByModel.append([Accuracy,Recall,Precision,F1,comment])
    print(classification_report(author_test_b, author_predicted_b, target_names=target_names))

In [13]:
import matplotlib.pyplot as plt
%matplotlib inline
import itertools

In [14]:
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [15]:
ScoreSummaryByModelParams=list()

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer

In [24]:
def ModelParamsEvaluation (f_union,model,params,comment):
    pipeline = Pipeline([
    # Extract the text & text_coded
    # Use FeatureUnion to combine the features from different vectorizers
    ('union', f_union),
    # Use a  classifier on the combined features
    ('clf', model)
    ])
    grid_search = GridSearchCV(estimator=pipeline, param_grid=params, verbose=1)
    grid_search.fit(twts_train['text'], author_train['author'])
    author_predicted = grid_search.predict(twts_test['text'])
    lb = LabelBinarizer()
    author_test_b = lb.fit_transform(author_test['author'])
    author_predicted_b  = lb.fit_transform(author_predicted)
    #best score
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    author_names=grid_search.best_estimator_.named_steps['clf'].classes_

    for param_name in sorted(params.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        ScoreSummaryByModelParams.append([comment,grid_search.best_score_,"\t%s: %r" % (param_name, best_parameters[param_name])]) 
    return (author_predicted,author_predicted_b,author_test_b,author_names)

In [37]:
f2_union=FeatureUnion(
        transformer_list=[
            # Pipeline for pulling char features  from the text
            ('char', Pipeline([
                ('tfidf',     TfidfVectorizer(analyzer='char',ngram_range=(1, 5))),
            ])),
            # Pipeline for pulling stememd word features from the text
            # ('text', Pipeline([
            #     ('tfidf',    TfidfVectorizer(analyzer='word',tokenizer= text_process,ngram_range=(1, 1))),
            # ])),        

        ],

    )

In [38]:
from sklearn.svm import LinearSVC
#LinearSVC
p = {'clf__C': (1,0.1,0.01,0.001,0.0001)}
(author_predicted,author_predicted_b, author_test_b,author_names)=ModelParamsEvaluation(f2_union,LinearSVC(),p,'LinearSVC')

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best score: 0.881
Best parameters set:
	clf__C: 1


In [39]:
PredictionEvaluation(author_predicted_b, author_test_b,author_names,'LinearSVC')

                 precision    recall  f1-score   support

     AdamSavage       0.78      0.85      0.81       182
    BarackObama       0.85      0.94      0.89       180
    DonaldTrump       0.90      0.85      0.87       212
FiveThirtyEight       0.99      1.00      0.99       198
 HillaryClinton       0.90      0.83      0.87       216
  KimKardashian       0.93      0.93      0.93       199
           NASA       0.93      0.95      0.94       194
 RichardDawkins       0.86      0.85      0.86       204
     ScottKelly       0.92      0.90      0.91       204
  deGrasseTyson       0.87      0.83      0.85       210
        various       1.00      1.00      1.00       201

      micro avg       0.90      0.90      0.90      2200
      macro avg       0.90      0.90      0.90      2200
   weighted avg       0.90      0.90      0.90      2200
    samples avg       0.90      0.90      0.90      2200

