In [1]:
import pickle
import warnings

import pandas as pd
from sklearn import metrics

from preprocess import denoise_text
from logistic import fit_logistic_classifier
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [2]:
# Read the training split and clean its text column in a single chain.
# lineterminator='\n' makes the parser split rows on '\n' only
# (presumably because the raw text can contain other line-break
# characters -- TODO confirm against the data).
# Every value in `text` is cast to str before being passed through
# denoise_text (project helper from preprocess; exact cleaning steps are
# not visible from this notebook).
train = pd.read_csv('data/train.csv', lineterminator='\n').assign(
    text=lambda frame: frame['text'].astype(str).apply(denoise_text)
)

In [3]:
# Train the classifier on [text, label] pairs.
# The two columns are flattened into a plain list of two-element lists,
# which is the form handed to fit_logistic_classifier. The returned
# object is indexed elsewhere in this notebook with the keys
# 'label_encoder', 'vectorizer' and 'clf'.
data = train.loc[:, ['text', 'label']].to_numpy().tolist()
model = fit_logistic_classifier(data)

In [4]:
# Prepare the two held-out splits with the same steps used for training:
# read the CSV, cast `text` to str and run it through denoise_text, then
# reuse the label encoder and vectorizer stored in `model` (transform only,
# nothing is re-fitted here).

# --- Validation split ---
valid = pd.read_csv('data/valid.csv', lineterminator='\n').assign(
    text=lambda frame: frame['text'].astype(str).apply(denoise_text)
)
y_valid = model['label_encoder'].transform(valid['label'])
X_valid = model['vectorizer'].transform(valid['text'])

# --- Testing split ---
test = pd.read_csv('data/test.csv', lineterminator='\n').assign(
    text=lambda frame: frame['text'].astype(str).apply(denoise_text)
)
y_test = model['label_encoder'].transform(test['label'])
X_test = model['vectorizer'].transform(test['text'])

In [5]:
# Report per-class precision / recall / F1 on each held-out split.
# Each entry pairs the heading to print with that split's features and
# encoded labels; validation is reported first, then testing.
clf = model['clf']
for heading, X_split, y_split in (
    ('Validation data:\n', X_valid, y_valid),
    ('\n\nTesting data (unseen twitter handles):\n', X_test, y_test),
):
    print(heading)
    print(metrics.classification_report(y_split, clf.predict(X_split)))

Validation data:

              precision    recall  f1-score   support

           0       0.90      0.92      0.91    357773
           1       0.94      0.92      0.93    433777

    accuracy                           0.92    791550
   macro avg       0.92      0.92      0.92    791550
weighted avg       0.92      0.92      0.92    791550



Testing data (unseen twitter handles):

              precision    recall  f1-score   support

           0       0.73      0.85      0.79    142178
           1       0.90      0.81      0.85    231942

    accuracy                           0.83    374120
   macro avg       0.82      0.83      0.82    374120
weighted avg       0.84      0.83      0.83    374120



In [7]:
# Persist the fitted model object to disk, then read it back.
# `pickle` is imported in the notebook's first (imports) cell rather than
# here, so every dependency is visible up front.
pkl_filename = "tweetyCARDS.pkl"

# Save: serialize the whole `model` object (the same object that is
# indexed with 'label_encoder', 'vectorizer' and 'clf' above).
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
# NOTE: pickle.load can execute arbitrary code while deserializing. That is
# fine for the file written just above; do not point this at a pickle
# obtained from an untrusted source.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)