In [13]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from dataextractor import get_data
from sklearn.pipeline import Pipeline


In [17]:
# Step 1 − Collecting the Dataset
df = get_data(return_DF=True)

# Step 2− Preprocessing the Data

# Remove URLs
df['text'] = df['text'].apply(lambda x: re.compile(r'https?://\S+|www\.\S+').sub(r'',x))
# Remove punctuation
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
# Remove HTML:
df['text'] = df['text'].apply(lambda x: re.compile(r'<.*?>').sub(r'',x))
# Remove emojis:
df['text'] = df['text'].apply(lambda x: re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE).sub(r'',x))
# Transform shortened words in full form:
def decontraction(text):
    text = re.sub(r"won\'t", " will not", text)
    text = re.sub(r"won\'t've", " will not have", text)
    text = re.sub(r"can\'t", " can not", text)
    text = re.sub(r"don\'t", " do not", text)
    text = re.sub(r"can\'t've", " can not have", text)
    text = re.sub(r"ma\'am", " madam", text)
    text = re.sub(r"let\'s", " let us", text)
    text = re.sub(r"ain\'t", " am not", text)
    text = re.sub(r"shan\'t", " shall not", text)
    text = re.sub(r"sha\n't", " shall not", text)
    text = re.sub(r"o\'clock", " of the clock", text)
    text = re.sub(r"y\'all", " you all", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"n\'t've", " not have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ll've", " will have", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)
    return text 

df['text'] = df['text'].apply(lambda x: decontraction(x))
# Keep only letters
df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
# Lower case
df['text'] = df['text'].apply(lambda x: x.lower())
# Remove stop words
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in set(stopwords.words('english'))]))

# Step 3− Creating the TF-IDF Matrix
X = df['text']
y = df['label'].apply(lambda x: {'positive':1.0,'neutral':0.0,'negative':-1.0}[x])

# Step 4− Splitting the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [22]:
print("TRAIN")
print(X.head(),y.head())

# Step 5− Set up pipeline
model = Pipeline([
        ('vect',CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf',MultinomialNB())])

model.fit(X_train, y_train)

TRAIN
0    match getin plusu c tomorrow busy dayu c aware...
1    area new england could see first flake season ...
2    francescocon nd worst qb definitely tony romo ...
3    thailand washington u president barack obama v...
4    yu hear tony romo dressed halloween giant quat...
Name: text, dtype: object 0    0.0
1    0.0
2   -1.0
3    0.0
4    0.0
Name: label, dtype: float64


In [23]:
# Step 6− Evaluating the Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {accuracy:}")
print(f"Precision: {precision:}")
print(f"Recall: {recall:}")
print(f"F1 score: {f1:}")

Accuracy: 0.6085568963797746
Precision: 0.6969255235635737
Recall: 0.4799961673756637
F1 score: 0.44817609582475626
