In [19]:
import os
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [2]:
# Set the path to the folder containing CSV files
# (relative path, so it is resolved against the kernel's current working directory)
folder_path = 'tweets/'
# Display the directory listing as the cell output
os.listdir(folder_path)
            
            

['.ipynb_checkpoints',
 '10460KDNuggetsTweets.csv',
 'AdamSavageTweets.csv',
 'AllTweets.csv',
 'BarackObama.csv',
 'DonaldTrump2014-01-01To2016-10-14Tweets.csv',
 'DonaldTrumpTweets.csv',
 'FiveThirtyEightTweets.csv',
 'HillaryClinton2014-01-01To2016-10-14Tweets.csv',
 'HillaryClintonTweets.csv',
 'KimKardashianTweets.csv',
 'NeildeGrasseTysonTweets.csv',
 'RichardDawkins.csv',
 'ScottKelly.csv']

In [3]:
# Get a list of all CSV files in the folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the row order of the combined frame identical from one run/machine to the next.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Read every CSV file into its own DataFrame
dfs = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index.
# NOTE(review): the folder listing shows 'AllTweets.csv' next to the per-author
# files, plus two files each for DonaldTrump and HillaryClinton -- presumably these
# overlap; verify that combined_df holds no duplicate tweets before it is split
# into train/test sets.
combined_df = pd.concat(dfs, ignore_index=True)

# Display the combined DataFrame
combined_df

Unnamed: 0.2,Unnamed: 0,date,id,link,retweet,text,author,Unnamed: 0.1
0,0,Oct 4,783396985093193728,/missyscheng/status/783396985093193728,False,#DataScience Basics: #DataMining vs. #Statisti...,various,
1,1,Oct 4,783381842024103936,/EXASOLAG/status/783381842024103936,False,How to Become a #Data Scientist – Part 1: http...,various,
2,2,Oct 4,783433625723252736,/TarasNovak/status/783433625723252736,False,@jesterxl @kdnuggets or just go with @tableau :),various,
3,3,Oct 4,783428740453982208,/kdnuggets/status/783428740453982208,False,#Boston U. Online MS in Applied #Business #Ana...,various,
4,4,1h1 hour ago,787052623291641856,/kdnuggets/status/787052623291641856,False,#ICYMI Still Searching for ROI in #BigData Ana...,various,
...,...,...,...,...,...,...,...,...
173030,1214,24 Aug 2009,3506949420,/StationCDRKelly/status/3506949420,False,@karen4jazz thanks!,ScottKelly,
173031,1215,23 Aug 2009,3505850138,/StationCDRKelly/status/3505850138,False,The HARDEST thing about this ISS training is h...,ScottKelly,
173032,1216,23 Aug 2009,3500803828,/StationCDRKelly/status/3500803828,False,Eating breakfast at the Okura Frontier Hotel i...,ScottKelly,
173033,1217,23 Aug 2009,3488056654,/StationCDRKelly/status/3488056654,False,I think you will find the comparison (and cont...,ScottKelly,


In [4]:
# Load dataset
# data = pd.read_csv('blogtext_short.csv')
# data.head()


In [5]:
# create a dictionary to store the sampled records for each author
sampled_data = {}

# get a list of unique author labels
authors = combined_df['author'].unique()

# loop through each author and sample 1000 records of text.
# A fixed random_state makes the draw repeatable, so the downstream split and the
# reported metrics can be reproduced after a kernel restart.
# Note: DataFrame.sample raises ValueError if an author has fewer than 1000 rows.
for author in authors:
    author_rows = combined_df.loc[combined_df['author'] == author, ['text', 'author']]
    sampled_data[author] = author_rows.sample(n=1000, random_state=42)

# combine the sampled data for each author into a single dataframe
# (the original row labels from combined_df are kept as the index)
df = pd.concat(sampled_data.values())

df

Unnamed: 0,text,author
41362,#Python and R are now neck to neck in the late...,various
41167,Whitepaper: The Journey to #OpenDataScience ht...,various
45178,#DeepLearning in #Healthcare Part 1: Opportuni...,various
39488,RT RT @kdnuggets JupyterLab: the next generat...,various
36442,Top Algorithms and Methods Used by Data Scient...,various
...,...,...
169130,Sam says Hamas publicly says they'd like to ki...,RichardDawkins
98753,This crowd brought their children to watch ISI...,RichardDawkins
168669,Wonderful conversation with @NeilTyson in that...,RichardDawkins
169495,"""My wife and I join Xs the world over celebrat...",RichardDawkins


In [6]:
# Count how many sampled rows each author label contributes
df['author'].value_counts()

various            1000
AdamSavage         1000
NASA               1000
BarackObama        1000
DonaldTrump        1000
FiveThirtyEight    1000
HillaryClinton     1000
KimKardashian      1000
deGrasseTyson      1000
ScottKelly         1000
RichardDawkins     1000
Name: author, dtype: int64

### Split dataset

In [7]:
# Inputs come from the 'text' column, labels from the 'author' column
X, y = df['text'], df['author']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
# Clean and preprocess text data
# Stop-word sets keyed by language name, filled on first use so the list is not
# rebuilt for every single tweet.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language='english'):
    """Return the NLTK stop-word list for `language` as a frozenset, built once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def clean_text(text):
    """Normalise one piece of text for feature extraction.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    tokenize with NLTK's word_tokenize, drop English stop words, and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN from an empty CSV field) are
        treated as missing and produce an empty string instead of raising.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    # Missing values arrive as float NaN from pandas; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ''

    # Remove special characters and numbers, then convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Tokenize into words and remove stop words
    stop_words = _stop_words_for('english')
    words = [word for word in word_tokenize(text) if word not in stop_words]

    # Normalization: collapse any whitespace runs (raw string avoids the
    # invalid-escape warning that '\s' triggers in a plain literal)
    text_clean = ' '.join(words)
    return re.sub(r'\s+', ' ', text_clean).strip()

In [9]:

# Apply clean_text to every document of both splits.
# NOTE(review): the experiment cells visible in this notebook pass X_train/X_test
# to the feature extractors, not X1_train/X1_test -- confirm whether the cleaned
# text is meant to be used.
X1_train, X1_test = (split.apply(clean_text) for split in (X_train, X_test))

## Feature extraction methods

### Bag of Words

In [10]:
def bag_of_word(X_train, X_test):
    """Build bag-of-words count features for the train and test documents.

    A CountVectorizer (with its built-in English stop-word list) is fitted on
    X_train only; X_test is transformed with that fitted vocabulary.

    Returns a (train_features, test_features) tuple.
    """
    bow = CountVectorizer(stop_words='english')
    return bow.fit_transform(X_train), bow.transform(X_test)

### TF-IDF

In [11]:
def tf_idf(X_train, X_test):
    """Build TF-IDF features for the train and test documents.

    A TfidfVectorizer with default settings is fitted on X_train only and then
    reused to transform X_test.

    Returns a (train_features, test_features) tuple.
    """
    extractor = TfidfVectorizer()
    fitted_train = extractor.fit_transform(X_train)
    return fitted_train, extractor.transform(X_test)

### N-Gram

In [44]:
def n_gram(X_train, X_test, ngram_range=(1, 1)):
    """Build TF-IDF features over word n-grams for the train and test documents.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and testing documents.
    ngram_range : tuple (min_n, max_n), default (1, 1)
        Passed straight to TfidfVectorizer. The default keeps the previous
        behaviour, which is unigrams only and therefore equivalent to tf_idf();
        pass e.g. (1, 2) to actually include bigrams.

    Returns
    -------
    (train_features, test_features)
        The vectorizer is fitted on X_train only and reused for X_test.
    """
    tfidf = TfidfVectorizer(ngram_range=ngram_range)

    # fit on the training data, then transform the testing data with the same vocabulary
    train_features = tfidf.fit_transform(X_train)
    test_features = tfidf.transform(X_test)

    return train_features, test_features

## Classifiers

### Random Forest

In [13]:
def random_forest(train_features, y_train, test_features, y_test, random_state=42):
    """Train a RandomForestClassifier and print its evaluation on the test set.

    Parameters
    ----------
    train_features, y_train
        Training feature matrix and labels.
    test_features, y_test
        Testing feature matrix and labels.
    random_state : int or None, default 42
        Seed for the forest so repeated runs give the same model and metrics.
        Pass None for the unseeded behaviour.

    Returns
    -------
    None
        Metrics are printed by PredictionEvaluation.
    """
    # Train Random Forest classifier
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(train_features, y_train)

    # Predict on the test set and report the metrics
    y_pred = clf.predict(test_features)
    PredictionEvaluation(y_test, y_pred)

### Logistic Regression

In [22]:
def Logistic_Regression(train_features, y_train, test_features, y_test):
    """Train a default LogisticRegression model and print its test-set evaluation.

    Parameters
    ----------
    train_features, y_train
        Training feature matrix and labels.
    test_features, y_test
        Testing feature matrix and labels.

    Returns
    -------
    None
        Metrics are printed by PredictionEvaluation (which already reports
        accuracy, so it is not computed separately here).
    """
    # Fit the model on the training data
    lr = LogisticRegression()
    lr.fit(train_features, y_train)

    # Make predictions on the testing data and report the metrics
    predictions = lr.predict(test_features)
    PredictionEvaluation(y_test, predictions)

### Naive Bayes

In [54]:
def Naive_Bayes(train_features, y_train, test_features, y_test,
                alphas=(0.5, 1, 2, 5, 10), cv=5):
    """Grid-search MultinomialNB's alpha and print the test-set evaluation.

    Requires GridSearchCV from sklearn.model_selection to be imported in the
    notebook's import cell.

    Parameters
    ----------
    train_features, y_train
        Training feature matrix and labels; the cross-validated search is run
        on these only.
    test_features, y_test
        Testing feature matrix and labels.
    alphas : iterable of float, default (0.5, 1, 2, 5, 10)
        Candidate smoothing values for the grid search.
    cv : int, default 5
        Number of cross-validation folds passed to GridSearchCV.

    Returns
    -------
    None
        The best parameters are printed, followed by the metrics printed by
        PredictionEvaluation.
    """
    params = {'alpha': list(alphas)}
    nb_gs = GridSearchCV(MultinomialNB(), params, cv=cv)
    nb_gs.fit(train_features, y_train)
    print("Best parameters: ", nb_gs.best_params_)

    # Predict with the fitted search object and report the metrics
    y_pred = nb_gs.predict(test_features)
    PredictionEvaluation(y_test, y_pred)

### Evaluation

In [49]:
from sklearn.metrics import classification_report

In [58]:
def PredictionEvaluation(author_test_b,author_predicted_b):
    """Print accuracy, macro recall, macro precision, macro F1 and a per-class report.

    author_test_b holds the true labels and author_predicted_b the predicted
    labels. Nothing is returned; all results go to standard output.
    """
    print ('Accuracy', accuracy_score(author_test_b, author_predicted_b))

    # The remaining scores are all macro-averaged over the classes
    macro_scorers = (
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1', f1_score),
    )
    for label, scorer in macro_scorers:
        print (label, scorer(author_test_b, author_predicted_b, average='macro'))

    print(classification_report(author_test_b, author_predicted_b))

In [63]:
# Experiment: TF-IDF features evaluated with Logistic Regression.
# NOTE(review): the vectorizer is fed the raw X_train/X_test rather than the
# cleaned X1_train/X1_test computed earlier -- confirm this is intentional.
train_features, test_features = tf_idf(X_train, X_test)
# random_forest(train_features, y_train, test_features, y_test)
Logistic_Regression(train_features, y_train, test_features, y_test)
# Naive_Bayes(train_features, y_train, test_features, y_test)

Accuracy 0.8322727272727273
Recall 0.8313237548754171
Precision 0.8342430916765334
F1 0.832296102726934
                 precision    recall  f1-score   support

     AdamSavage       0.72      0.71      0.71       202
    BarackObama       0.89      0.81      0.85       214
    DonaldTrump       0.74      0.76      0.75       197
FiveThirtyEight       1.00      0.98      0.99       204
 HillaryClinton       0.80      0.85      0.82       210
  KimKardashian       0.76      0.77      0.76       178
           NASA       0.91      0.85      0.88       184
 RichardDawkins       0.79      0.82      0.80       213
     ScottKelly       0.88      0.86      0.87       194
  deGrasseTyson       0.73      0.77      0.75       198
        various       0.98      0.97      0.97       206

       accuracy                           0.83      2200
      macro avg       0.83      0.83      0.83      2200
   weighted avg       0.84      0.83      0.83      2200



'Accuracy: 0.83'

In [62]:
# train_features, test_features = bag_of_word(X_train, X_test)
# random_forest(train_features, y_train, test_features, y_test)
# Logistic_Regression(train_features, y_train, test_features, y_test)
# Naive_Bayes(train_features, y_train, test_features, y_test)

In [66]:
# Experiment: n_gram() features evaluated with grid-searched Naive Bayes.
# NOTE(review): the vectorizer is fed the raw X_train/X_test rather than the
# cleaned X1_train/X1_test computed earlier -- confirm this is intentional.
train_features, test_features = n_gram(X_train, X_test)
# random_forest(train_features, y_train, test_features, y_test)
# Logistic_Regression(train_features, y_train, test_features, y_test)
Naive_Bayes(train_features, y_train, test_features, y_test)

Best parameters:  {'alpha': 0.5}
Accuracy 0.8445454545454546
Recall 0.8436307791485124
Precision 0.8464279705633722
F1 0.8432887792454359
                 precision    recall  f1-score   support

     AdamSavage       0.82      0.72      0.77       202
    BarackObama       0.76      0.91      0.83       214
    DonaldTrump       0.72      0.76      0.74       197
FiveThirtyEight       0.97      0.97      0.97       204
 HillaryClinton       0.83      0.83      0.83       210
  KimKardashian       0.86      0.75      0.80       178
           NASA       0.85      0.96      0.90       184
 RichardDawkins       0.85      0.81      0.83       213
     ScottKelly       0.88      0.87      0.88       194
  deGrasseTyson       0.80      0.74      0.77       198
        various       0.97      0.97      0.97       206

       accuracy                           0.84      2200
      macro avg       0.85      0.84      0.84      2200
   weighted avg       0.85      0.84      0.84      2200

