In [168]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import spacy

# Load Training Data 

In [169]:
def load_data(file):
    """Read a CSV file into a DataFrame indexed by its ``id`` column.

    The file's columns are named ``id``, ``entity``, ``label`` and ``text``
    (in that order) via ``names=``; ``id`` then becomes the index, leaving
    three data columns.

    Args:
        file: Path (or anything ``pd.read_csv`` accepts) of the CSV to read.

    Returns:
        DataFrame with index ``id`` and columns ``entity``, ``label``, ``text``.
    """
    column_names = ['id', 'entity', 'label', 'text']
    frame = pd.read_csv(file, names=column_names)
    return frame.set_index('id')


In [170]:
# Load the training set into `df` (indexed by id; columns entity, label, text).
# NOTE(review): the path is hard-coded under the user's home directory; consider
# a configurable data directory so the notebook runs on other machines.
df = load_data("~/pythonprograms/archive/twitter_training.csv")

# Examine Data 

In [171]:
# Show five random rows as a quick sanity check of the parsed columns.
# NOTE(review): no random_state is passed, so the rows differ on every run.
df.sample(5)

Unnamed: 0_level_0,entity,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6980,johnson&johnson,Positive,Interesting
8446,NBA2K,Negative,@ N2K-park does not work on xbox
2066,CallOfDuty,Negative,bitch i did not play call of duty all these ye...
11917,Verizon,Neutral,There's times when I miss the old no-cell-phon...
1907,CallOfDutyBlackopsColdWar,Negative,Hell yeah!


In [172]:
# Count rows per label to see how balanced the classes are.
df['label'].value_counts()

label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

In [173]:
# Per-column dtypes and non-null counts; a non-null count below the row total
# indicates missing values in that column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74682 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   entity  74682 non-null  object
 1   label   74682 non-null  object
 2   text    73996 non-null  object
dtypes: object(3)
memory usage: 2.3+ MB


# Clean Data 

In [174]:
def clean(df):
    """Return ``df`` without duplicated rows and without rows that contain NaN.

    Duplicates are removed first (the first occurrence is kept), then any row
    with a missing value in any column is dropped. The input frame itself is
    not modified; a new DataFrame is returned.
    """
    deduplicated = df.drop_duplicates()
    return deduplicated.dropna()

In [175]:
# Rebind `df` to the cleaned frame (duplicates and rows with NaN removed).
# NOTE(review): the raw frame is no longer reachable after this cell; re-running
# it operates on the already-cleaned data.
df = clean(df)
# Confirm that every column now has the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70958 entries, 2401 to 9200
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   entity  70958 non-null  object
 1   label   70958 non-null  object
 2   text    70958 non-null  object
dtypes: object(3)
memory usage: 2.2+ MB


In [176]:
# Spot-check five random rows of the cleaned frame.
# NOTE(review): no random_state is passed, so the rows differ on every run.
df.sample(5)

Unnamed: 0_level_0,entity,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3480,Facebook,Neutral,Lol @ Facebook blocks links to negative websit...
6636,Fortnite,Irrelevant,Am feeling good??
8266,Microsoft,Negative,Sneaky... @microsoft @Windows decided to sneak...
9050,Nvidia,Positive,NVIDIA SHIELD UPDATE - NEW FEATURE! This is Af...
10888,TomClancysGhostRecon,Negative,had to go underground to get a double kill . .


# Preprocess Training Data 

In [177]:
# spaCy pipeline used by preprocesss_text() for tokens, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")
# Shared label encoder: preprocess() fits it on training labels and reuses the
# fitted mapping for non-training data.
encoder = LabelEncoder()

In [178]:
def preprocesss_text(text):
    """Return ``text`` reduced to the lemmas of its non-stop-word tokens.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words are skipped and the lemmas of the remaining tokens
    are joined with single spaces.

    Args:
        text: Raw text to normalise.

    Returns:
        A single space-separated string of lemmas (empty if nothing is kept).
    """
    kept_lemmas = [tok.lemma_ for tok in nlp(text) if not tok.is_stop]
    return ' '.join(kept_lemmas)

def preprocess(df, training=True):
    """Add the model-input columns to ``df`` in place.

    Three columns are written:

    * ``label_num``      - integer-encoded ``label`` from the shared ``encoder``.
    * ``text_processed`` - ``text`` after ``preprocesss_text``.
    * ``combined``       - ``entity`` + ' ' + ``text_processed``.

    Args:
        df: DataFrame with ``entity``, ``label`` and ``text`` columns. It is
            mutated; nothing is returned.
        training: When True the label encoder is (re)fitted on ``df['label']``;
            when False the already-fitted encoder is only applied, so the
            label-to-integer mapping matches the training data.
    """
    if training:
        df['label_num'] = encoder.fit_transform(df['label'])
    else:
        df['label_num'] = encoder.transform(df['label'])
    # Must reference the helper by its actual name, `preprocesss_text`; the
    # former `preprocesssText` is not defined anywhere and raised NameError
    # on a fresh kernel.
    df['text_processed'] = df['text'].apply(preprocesss_text)
    # Prepend the entity so it becomes part of the text the model sees.
    df['combined'] = df['entity'] + ' ' + df['text_processed']

In [161]:
# (Stale cell removed.) It used `vectorizer` and df['combined'] before either
# exists on a top-to-bottom run. The feature matrix is built further down by
# get_data_for_model(df), once the vectorizer is created and preprocess(df) has run.

In [162]:
# NOTE(review): redundant on a top-to-bottom run. preprocess(df), called in a
# later cell with training=True, re-fits `encoder` and assigns df['label_num']
# again. The execution count is also out of sequence; candidate for removal.
df['label_num'] = encoder.fit_transform(df['label'])

In [180]:
# training=True by default: fits the label encoder and adds the label_num,
# text_processed and combined columns to `df` in place.
preprocess(df)

# Split Training Data 

In [181]:
# Shared TF-IDF vectorizer with default settings; get_data_for_model() fits it
# on training text and reuses the fitted vocabulary for non-training data.
vectorizer = TfidfVectorizer()

In [183]:
def get_data_for_model(df, training=True):
    """Turn a preprocessed frame into ``(features, labels)`` for the model.

    Args:
        df: DataFrame that already has the ``combined`` and ``label_num``
            columns.
        training: When True the module-level ``vectorizer`` is fitted on
            ``df['combined']`` and then applied; when False the vectorizer is
            only applied, without re-fitting.

    Returns:
        Tuple of the vectorized ``combined`` text and ``df['label_num']``.
    """
    to_features = vectorizer.fit_transform if training else vectorizer.transform
    return to_features(df['combined']), df['label_num']

In [184]:
# training=True by default: fits the vectorizer on df['combined'] and returns
# the feature matrix together with df['label_num'].
X, y = get_data_for_model(df)

In [96]:
# Hold out 20% of the training data with a fixed seed.
# NOTE(review): this split is not used afterwards. The model below is fitted on
# the full X, y, and X_test / y_test are reassigned in the evaluation cell. The
# execution count is also out of sequence. Either train on X_train / y_train or
# drop this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model 

In [185]:
# Logistic-regression classifier; max_iter=1000 caps the solver's iterations.
model = LogisticRegression(max_iter=1000)
# Fitted on the full training matrix (X, y), not on the X_train / y_train split.
model.fit(X, y)

# Calculate Accuracy

In [186]:
# Evaluate on the separate validation file, using the same load -> clean ->
# preprocess -> vectorize steps as the training data.
test_df = load_data("~/pythonprograms/archive/twitter_validation.csv")
test_df = clean(test_df)
# training=False: reuse the encoder fitted on the training labels.
preprocess(test_df, training=False)
# training=False: reuse the fitted vectorizer. This rebinds X_test / y_test,
# replacing the values from the earlier train_test_split.
X_test, y_test = get_data_for_model(test_df, training=False)

y_pred = model.predict(X_test)
# Fraction of exactly matching labels, printed as a percentage.
acc = accuracy_score(y_test, y_pred)
print('Accuracy: ', acc * 100)

Accuracy:  89.7
