In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import joblib
import spacy

# Load Training Data 

In [2]:
def load_data(file):
    """Read a headerless tweet CSV into a DataFrame indexed by ``id``.

    The file is parsed with the fixed column names ``id``, ``entity``,
    ``label`` and ``text`` (in that order); ``id`` is then moved from a
    column to the index, leaving ``entity``, ``label`` and ``text`` as columns.
    """
    column_names = ['id', 'entity', 'label', 'text']
    frame = pd.read_csv(file, names=column_names)
    return frame.set_index('id')


In [3]:
# Training set; the path is relative to the current user's home directory.
df = load_data("~/sentiment_analysis/archive/twitter_training.csv")

# Examine Data 

In [4]:
# Peek at five random rows (no random_state is passed, so the rows differ between runs).
df.sample(5)

Unnamed: 0_level_0,entity,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1536,Battlefield,Irrelevant,For Buurrrrnnnnn Baby Burn!! Check me out on t...
556,ApexLegends,Neutral,Thank you guys for supporting my streams. We h...
6541,Fortnite,Negative,I Edit
7583,LeagueOfLegends,Neutral,Ever since I started LeagueOfLegends I've had ...
2653,Borderlands,Neutral,and Shitting around | Borderlands 3 | Part 5.5...


In [5]:
# Class balance: number of rows per sentiment label.
df['label'].value_counts()

label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [6]:
# Column dtypes and non-null counts; used here to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74682 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   entity  74682 non-null  object
 1   label   74682 non-null  object
 2   text    73996 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


# Clean Data 

In [7]:
def clean(df):
    """Return a new frame without duplicate rows and without rows holding missing values.

    Duplicates are removed first, then any row that still contains a missing
    value is dropped. The input frame itself is left untouched.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated.dropna()

In [8]:
# Rebind df to the cleaned frame (the raw frame is no longer reachable under this name).
df = clean(df)
# Re-check the row count and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70958 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   entity  70958 non-null  object
 1   label   70958 non-null  object
 2   text    70958 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


In [9]:
# Spot-check five random rows of the cleaned frame (unseeded, so output varies per run).
df.sample(5)

Unnamed: 0_level_0,entity,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5693,HomeDepot,Positive,Stop Your bleed
9727,PlayStation5(PS5),Irrelevant,YALL FR FR GOT NO CHILL .
6196,FIFA,Positive,Spent most of Quarantine playing FIFA and winn...
1305,Battlefield,Irrelevant,40% rise in Xbox 360 player GnomeVillage has o...
8322,Microsoft,Negative,I'd cry. I'd cry loud tears. Will it happen? D...


# Preprocess Training Data 

In [10]:
# Shared spaCy pipeline; preprocesss_text uses it for tokens, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")
# Shared label encoder; preprocess() fits it on training labels and reuses it for other data.
encoder = LabelEncoder()

In [11]:
def preprocesss_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline, skipping stop words.

    Every token that spaCy does not flag as a stop word contributes its lemma;
    no other filtering is applied. The lemmas are returned as one string
    joined by single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text) if not token.is_stop]
    return ' '.join(lemmas)

def preprocess(df, training=True):
    """Add the derived modelling columns to ``df`` in place (returns ``None``).

    Columns written:
      * ``label_num``      - ``label`` run through the module-level ``encoder``.
      * ``text_processed`` - ``text`` run through ``preprocesss_text``.
      * ``combined``       - ``entity``, a space, then ``text_processed``.

    With ``training=True`` the encoder is fitted on ``df['label']`` before
    encoding; with ``training=False`` the already-fitted encoder is only applied.
    """
    # Fit the encoder only on training data; other data reuses the learned mapping.
    encode_labels = encoder.fit_transform if training else encoder.transform
    df['label_num'] = encode_labels(df['label'])

    df['text_processed'] = df['text'].apply(preprocesss_text)

    # Prefix the entity name so it can be fed to the model together with the text.
    df['combined'] = df['entity'] + ' ' + df['text_processed']

In [12]:
# Training mode (default): fits the label encoder and adds label_num / text_processed / combined to df in place.
preprocess(df)

# Store Preprocessed Data to Save Time for Future Usage 

In [13]:
# Cache the preprocessed frame on disk so the preprocessing pass need not be repeated.
# NOTE(review): no cell visible in this notebook reads this file back - confirm where it is consumed.
df.to_csv("~/sentiment_analysis/archive/twitter_training_preprocessed.csv")

# Split Training Data 

In [18]:
# Shared TF-IDF vectorizer with default settings; get_data_for_model fits it on training text
# and reuses the fitted instance for any other data.
vectorizer = TfidfVectorizer()

In [19]:
def get_data_for_model(df, training=True, text_column='text_processed'):
    """Vectorize one text column of ``df`` and return ``(features, labels)``.

    Parameters
    ----------
    df : DataFrame
        Must contain ``text_column`` and ``label_num``.
    training : bool, default True
        When true, the module-level ``vectorizer`` is fitted on the column and
        applied in one step (``fit_transform``); otherwise the already-fitted
        vectorizer is only applied (``transform``).
    text_column : str, default 'text_processed'
        Column to vectorize. Pass ``'combined'`` to use the entity-prefixed
        text instead. The same column must be used for fitting and for every
        later transform so the features stay comparable.

    Returns
    -------
    tuple
        The vectorizer output for ``df[text_column]`` and ``df['label_num']``.
    """
    # Only training data may fit the vocabulary; everything else reuses it.
    vectorize = vectorizer.fit_transform if training else vectorizer.transform
    return vectorize(df[text_column]), df['label_num']

In [20]:
# Training mode (default): fits the vectorizer on the training text; X is the feature matrix, y the encoded labels.
X, y = get_data_for_model(df)

In [17]:
# Hold-out split left disabled: the models below are fitted on all of X, y and
# evaluated on the separate twitter_validation.csv file instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build Models 


In [21]:

# Candidate classifiers keyed by display name; the key is also used as the file-name
# prefix when the models are persisted and reloaded.
models = {
    'NaiveBayes': MultinomialNB(),
    # max_iter raised to 1000 (presumably to avoid convergence warnings - TODO confirm).
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear', C=1),
}

# Train Model 

In [22]:
# Fit every candidate on the full training matrix (no hold-out is carved off here).
for model in models.values(): 
    model.fit(X, y)

# Persist Models 

In [38]:
# Persist every fitted classifier as "<name>.model.joblib" in the working directory.
for name, model in models.items():
    joblib.dump(model, name + '.model.joblib')

# The saved classifiers are only usable with the exact TF-IDF vocabulary and
# label mapping they were trained with, so store the fitted vectorizer and
# label encoder next to them ("vectorizer.joblib", "encoder.joblib").
for artifact_name, artifact in (('vectorizer', vectorizer), ('encoder', encoder)):
    joblib.dump(artifact, artifact_name + '.joblib')

In [None]:
# Replace each entry of `models` with the copy previously saved as "<name>.model.joblib".
# Only the values are reassigned, so iterating over the dict while doing this is safe.
# NOTE: joblib.load unpickles the file and can execute arbitrary code - only load files you trust.
for name in models: 
    models[name] = joblib.load(name+'.model.joblib')

# Calculate Accuracy

In [23]:
# The validation set goes through the same load -> clean -> preprocess steps as the training data.
test_df = load_data("~/sentiment_analysis/archive/twitter_validation.csv")
test_df = clean(test_df)
# training=False: reuse the already-fitted label encoder (transform only, no refit).
preprocess(test_df, training=False)
# training=False: reuse the already-fitted vectorizer (transform only, no refit).
X_test, y_test = get_data_for_model(test_df, training=False)

In [None]:
# Report accuracy of every model on the training data and on the validation data.
for name, model in models.items():
    # For scikit-learn classifiers `score` returns mean accuracy, so no separate
    # predict() / accuracy_score() pass is needed.
    training_score = model.score(X, y)
    testing_score = model.score(X_test, y_test)
    # NOTE(review): the "(bias)" / "(variance)" tags in the message are loose shorthand;
    # the numbers printed are plain training / validation accuracy in percent.
    print(name, ' training score (bias): ', training_score * 100, '; testing score (variance): ', testing_score * 100)

NaiveBayes  training score (bias):  78.50700414329603 ; testing score (variance):  79.9
LogisticRegression  training score (bias):  85.8972913554497 ; testing score (variance):  90.5


# Compare LR and SVM

In [55]:
# Predict the validation set with both models (encoded labels).
y_predict_LG = models['LogisticRegression'].predict(X_test)
y_predict_SVM = models['SVM'].predict(X_test)
# Decode predictions back to label strings and attach them to test_df in place.
test_df['predict_LG'] = encoder.inverse_transform(y_predict_LG)
test_df['predict_SVM'] = encoder.inverse_transform(y_predict_SVM)
# Global pandas display option (not scoped to this cell): show full cell text instead of truncating.
pd.set_option('display.max_colwidth', None)

selected_columns = ['entity','text','label', 'predict_LG', 'predict_SVM']
# Keep only the rows on which the two models disagree.
criteria = test_df['predict_LG'] != test_df['predict_SVM']
test_df.loc[criteria, selected_columns]

Unnamed: 0_level_0,entity,text,label,predict_LG,predict_SVM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2286,CallOfDuty,"Call of duty warzone (livestream) w/ subs #Warzone youtu.be/7BhH_pjOMU4 via @YouTube Please come watch this AMAZING Call of Duty Warzone stream from this AMAZING streamer! It'd be really, really nice to give him some views and likes as well! 😀 #COD #CallofDuty #Warzone",Irrelevant,Positive,Irrelevant
3293,Facebook,"@BeverlyCitizen Ronald Bellanti is a resident of Beverly, MA and works for Ground Control, an organization he founded as a drunk driving prevention campaign. He took to Facebook to express deplorably racist and violent views. Please help expose him! pic.twitter.com/78RitUkeM2",Positive,Irrelevant,Positive
11277,TomClancysRainbowSix,Ok I'm blocking this man's he is on a new level of being,Irrelevant,Positive,Negative
11020,TomClancysGhostRecon,This is how much I enjoy #ghostreconbreakpoint and #division2 I love the lore of both titles. #gaming #gamebooks instagram.com/p/B9feINlnn1U/…,Positive,Neutral,Positive
9608,PlayStation5(PS5),God when 🥺,Negative,Negative,Positive
4132,CS-GO,I used a voice changer and pretended to be a girl in a csgo match last night and now my DMs are flooded xD,Neutral,Irrelevant,Neutral
11175,TomClancysGhostRecon,GM Fam!!! hope you are all up and being great today.... jus wanted to take a second to thank all my followers for the support you guys are truly awesome💪🏾👍🏾🙌🏾... #stateofdecay2 #ghostreconbreakpoint #RedDeadRedemption2 #SmallStreamerCommunity,Neutral,Positive,Neutral
8174,Microsoft,#Indigo Urgent Care looks to Microsoft Teams and Microsoft’s Power Platform to help deliver quality care and world-class patient experience lnkd.in/eAzWmuB,Positive,Neutral,Negative
1714,CallOfDutyBlackopsColdWar,I know I trash BO1 multiplayer any chance I get but in all honesty I wouldn’t mind a Black Ops remaster. Nothing could ever be as worse as MW2019. Get to work on it...,Neutral,Negative,Neutral
13044,Xbox(Xseries),"You can catch up with all of ""The Xbox Has No Games Podcast"" right here:\n\nintromediagaming.com/home/the-xbox-…\n\nOne of the best Xbox Podcast out here\n\n#Xbox #XboxSeriesX #XboxSeriesS https://t.co/CmlRJsgZyk",Irrelevant,Neutral,Irrelevant
