In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
print("Training Set:"% train.columns, train.shape, len(train))

Training Set: (31962, 3) 31962


In [3]:
test = pd.read_csv('test.csv')
print("Test Set:"% test.columns, test.shape, len(test))

Test Set: (17197, 2) 17197


In [4]:
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    return df

In [5]:
train_clean = clean_text(train, "tweet")

In [6]:
test_clean = clean_text(test, "tweet")

In [7]:
from sklearn.utils import resample
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

In [8]:
train_minority_upsampled = resample(train_minority,
                                 replace=True,
                                 n_samples=len(train_majority),
                                 random_state=123)

In [9]:
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [11]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'], train_upsampled['label'],random_state = 0)

In [14]:
model = pipeline_sgd.fit(X_train, y_train)

In [15]:
y_predict = model.predict(X_test)

In [17]:
from sklearn.metrics import f1_score,accuracy_score, classification_report, confusion_matrix
print(f1_score(y_test, y_predict))

0.9696484557401107


In [18]:
print(accuracy_score(y_test, y_predict))

0.9693808882907133


In [20]:
print(confusion_matrix(y_test, y_predict))

[[7137  353]
 [ 102 7268]]


In [22]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      7490
           1       0.95      0.99      0.97      7370

    accuracy                           0.97     14860
   macro avg       0.97      0.97      0.97     14860
weighted avg       0.97      0.97      0.97     14860



In [23]:
import joblib

In [24]:
joblib.dump(model, 'model.pkl')

['model.pkl']