In [1]:
import pandas as pd
import nltk
import re
import numpy as np
from nltk.corpus import stopwords
nltk.download('punkt')
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lokes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Extracting lexicons from previous tsv files
#Subreddits
file_names = ['3DS.tsv', '4chan.tsv', '2007scape.tsv', 'ACTrade.tsv', 'amiugly.tsv', 'BabyBumps.tsv', 'baseball.tsv', 'canada.tsv', 'CasualConversation.tsv', 'DarknetMarkets.tsv', 'darksouls.tsv', 'elderscrollsonline.tsv', 'Eve.tsv', 'Fallout.tsv', 'fantasyfootball.tsv', 'GameDeals.tsv', 'gamegrumps.tsv', 'halo.tsv', 'Homebrewing.tsv', 'IAmA.tsv', 'india.tsv', 'jailbreak.tsv', 'Jokes.tsv', 'KerbalSpaceProgram.tsv', 'Keto.tsv', 'leagueoflegends.tsv', 'Libertarian.tsv', 'magicTCG.tsv', 'MakeupAddiction.tsv', 'Naruto.tsv', 'nba.tsv', 'oculus.tsv', 'OkCupid.tsv', 'Parenting.tsv','pathofexile.tsv', 'raisedbynarcissists.tsv', 'Random_Acts_Of_Amazon.tsv', 'science.tsv', 'Seattle.tsv', 'TalesFromRetail.tsv', 'talesfromtechsupport.tsv', 'ultrahardcore.tsv', 'videos.tsv', 'Warthunder.tsv', 'whowouldwin.tsv', 'xboxone.tsv', 'yugioh.tsv']

dataframes = []

for file_name in file_names:
    df = pd.read_csv(file_name, sep = '\t')
    df.columns = ['Words', 'Mean Score', 'Standard Score']
    dataframes.append(df)

subreddit_df = pd.concat(dataframes)
print(subreddit_df)

          Words  Mean Score  Standard Score
0        shitty       -6.04            1.41
1      pathetic       -5.80            1.50
2          suck       -5.75            1.49
3        stupid       -5.72            1.37
4          lazy       -5.64            1.34
...         ...         ...             ...
4642        mat        3.62            1.26
4643    artwork        3.66            1.12
4644        art        3.71            1.18
4645       sexy        3.81            1.08
4646  beautiful        3.82            0.69

[222079 rows x 3 columns]


In [3]:
#adjectives
adj_df = pd.read_csv('2000_adj.tsv', sep = '\t')
adj_df.columns = ['Words', 'Mean Score', 'Standard Score']
print(adj_df)

          Words  Mean Score  Standard Score
0        tragic       -3.15            0.66
1          ugly       -3.10            0.49
2     traumatic       -3.08            0.68
3      stinging       -3.05            0.56
4         awful       -2.89            0.49
...         ...         ...             ...
2036   gorgeous        2.73            0.68
2037     serene        2.81            0.44
2038       fine        2.82            0.44
2039  wonderful        2.91            0.41
2040  beautiful        3.07            0.54

[2041 rows x 3 columns]


In [4]:
#frequency
freq_df = pd.read_csv('2000_freq.tsv', sep = '\t')
freq_df.columns = ['Words', 'Mean Score', 'Standard Score']
print(freq_df)

          Words  Mean Score  Standard Score
0       painful       -3.69            1.53
1        intent       -3.49            1.67
2      terrible       -3.38            1.55
3         drunk       -3.28            1.16
4       tragedy       -3.26            1.48
...         ...         ...             ...
4918  perfectly        2.69            0.83
4919   romantic        2.70            0.76
4920   delicate        2.72            0.93
4921  beautiful        2.73            0.69
4922  wonderful        2.76            0.71

[4923 rows x 3 columns]


In [5]:
#Making lexicons into negative, neutral and positive
subreddit_df['Bins'] = pd.cut(subreddit_df['Mean Score'], bins=3, labels=['neg', 'nue', 'pos'])
adj_df['Bins']  = pd.cut(adj_df['Mean Score'], bins=3, labels=['neg', 'nue', 'pos'])
freq_df['Bins']  = pd.cut(freq_df['Mean Score'], bins=3, labels=['neg', 'nue', 'pos'])

#print(freq_df['Bins'])

In [6]:
# Loading data
def loaddata(filename):
    with open(filename, 'r', encoding = 'utf-8') as file:
        return file.readlines()

In [7]:
#Loading train data
train_tweets = loaddata('train_text.txt')

train_text = pd.DataFrame({"Tweets":train_tweets})

train_labels = pd.read_csv('train_labels.txt', sep='\t', names=["Labels"])

train_df = pd.concat([train_text["Tweets"], train_labels["Labels"] ], axis = 1)
train_df

Unnamed: 0,Tweets,Labels
0,"""QT @user In the original draft of the 7th boo...",2
1,"""Ben Smith / Smith (concussion) remains out of...",1
2,Sorry bout the stream last night I crashed out...,1
3,Chase Headley's RBI double in the 8th inning o...,1
4,@user Alciato: Bee will invest 150 million in ...,2
...,...,...
45610,"@user \""""So amazing to have the beautiful Lady...",2
45611,"9 September has arrived, which means Apple's n...",2
45612,Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...,2
45613,@user no I'm in hilton head till the 8th lol g...,1


In [8]:
#Loading validation data
val_tweets = loaddata('val_text.txt')

val_text = pd.DataFrame({"Tweets":val_tweets})
val_labels = pd.read_csv('val_labels.txt', sep='\t', names=["Labels"])

val_df = pd.concat([val_text["Tweets"], val_labels["Labels"] ], axis = 1)
val_df

Unnamed: 0,Tweets,Labels
0,Dark Souls 3 April Launch Date Confirmed With ...,1
1,"""National hot dog day, national tequila day, t...",2
2,When girls become bandwagon fans of the Packer...,0
3,@user I may or may not have searched it up on ...,1
4,Here's your starting TUESDAY MORNING Line up a...,1
...,...,...
1995,"""LONDON (AP) """" Prince George celebrates his s...",1
1996,Harper's Worst Offense against Refugees may be...,1
1997,Hold on... Sam Smith may do the theme to Spect...,2
1998,Gonna watch Final Destination 5 tonight. I alw...,1


In [9]:
#Loading test data
test_tweets = loaddata('test_text.txt')

test_text = pd.DataFrame({"Tweets":test_tweets})
test_labels = pd.read_csv('test_labels.txt', sep='\t', names=["Labels"])

test_df = pd.concat([test_text["Tweets"], test_labels["Labels"] ], axis = 1)
test_df

Unnamed: 0,Tweets,Labels
0,@user @user what do these '1/2 naked pics' hav...,1
1,OH: “I had a blue penis while I was this” [pla...,1
2,"@user @user That's coming, but I think the vic...",1
3,I think I may be finally in with the in crowd ...,2
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",0
...,...,...
12279,Sentinel Editorial: FBI’s Comey ‘had no one of...,1
12280,perfect pussy clips #vanessa hudgens zac efron...,1
12281,#latestnews 4 #newmexico #politics + #nativeam...,1
12282,Trying to have a conversation with my dad abou...,0


In [10]:
#Cleaning data
stop_words = set(stopwords.words('english'))

def tokenize_and_clean(tweet):
    tweet = str(tweet).lower()
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tokens = nltk.word_tokenize(tweet)
    clean_tokens = [word for word in tokens if word not in stop_words]
    
    return clean_tokens

In [11]:
train_df["cleaned_tweets"] = train_df["Tweets"].apply(tokenize_and_clean)
val_df["cleaned_tweets"] = val_df["Tweets"].apply(tokenize_and_clean)
test_df["cleaned_tweets"] = test_df["Tweets"].apply(tokenize_and_clean)

#print(test_df)

In [12]:
#Extracting Features
def FeatureExtractor_pos(tweet, lexicon_type):    
    pos_count = 0
    for word in tweet:
        if word in lexicon_type['Words']:
            pos_count = pos_count + 1            
    return pos_count

def FeatureExtractor_neu(tweet, lexicon_type):    
    neu_count = 0
    for word in tweet:
        if word in lexicon_type['Words']:
            neu_count += 1
    return neu_count   

def FeatureExtractor_neg(tweet, lexicon_type):  
    neg_count = 0
    for word in tweet:
        if word in lexicon_type['Words']:
            neg_count += 1
    return neg_count

In [13]:
#Extracting positive, neutral, negative Subreddit features
pos_subreddit = subreddit_df[subreddit_df['Bins']=="pos"].to_dict(orient='list')
neu_subreddit = subreddit_df[subreddit_df['Bins']=="nue"].to_dict(orient='list')
neg_subreddit = subreddit_df[subreddit_df['Bins']=="neg"].to_dict(orient='list')

print(pos_subreddit)

train_df['pos_subreddit'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_subreddit))
train_df['neu_subreddit'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_subreddit))
train_df['neg_subreddit'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_subreddit))

print(train_df)

val_df['pos_subreddit'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_subreddit))
val_df['neu_subreddit'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_subreddit))
val_df['neg_subreddit'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_subreddit))

test_df['pos_subreddit'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_subreddit))
test_df['neu_subreddit'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_subreddit))
test_df['neg_subreddit'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_subreddit))


{'Words': ['ar', 'wind', 'well', 'friend', 'germany', 'emulator', 'steamworld', 'whoever', 'someone', 'recommend', 'icarus', 'ports', 'pinball', 'originals', 'addictive', 'ps1', 'console', 'ds', 'spikes', 'buy', 'emulating', 'preferred', 'tennis', 'hd', 'scribblenauts', 'tokyo', 'enjoying', 'collector', 'inbox', 'disney', 'countries', 'emulate', 'oot', 'quick', 'sm3dl', 'canada', 'scammed', 'unused', 'desperate', 'challenging', 'emulated', 'wiiu', 'world', 'legend', 'numbers', 'smashing', 'which', 'surprisingly', 'message', 'songs', 'grew', 'imo', '1001', 'baby', 'emulation', 'stranger', 'chosen', 'want', 'crossed', 'between', 'greatly', 'gbc', 'kingdom', 'effects', 'art', 'ill', 'gift', 'virtual', 'vc', 'gear', 'genesis', 'trade', 'melee', 'exchange', 'won', 'perfect', 'metal', 'accurate', 'welcome', 'hearts', 'deal', 'style', 'complex', 'luigis', 'environments', 'nostalgia', 'tour', 'majoras', 'aus', 'hello', '2', 'kid', '60fps', 'gameboy', 'worlds', 'sunshine', 'eater', 'soooo', 'ex

In [14]:
#Extracting positive, neutral, negative adjective features
pos_adj = adj_df[adj_df['Bins']=="pos"].to_dict(orient='list')
neu_adj = adj_df[adj_df['Bins']=="nue"].to_dict(orient='list')
neg_adj = adj_df[adj_df['Bins']=="neg"].to_dict(orient='list')

#print(neg_subreddit)

train_df['pos_adj'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_adj))
train_df['neu_adj'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_adj))
train_df['neg_adj'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_adj))


val_df['pos_adj'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_adj))
val_df['neu_adj'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_adj))
val_df['neg_adj'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_adj))

test_df['pos_adj'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_adj))
test_df['neu_adj'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_adj))
test_df['neg_adj'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_adj))

print(train_df)

                                                  Tweets  Labels  \
0      "QT @user In the original draft of the 7th boo...       2   
1      "Ben Smith / Smith (concussion) remains out of...       1   
2      Sorry bout the stream last night I crashed out...       1   
3      Chase Headley's RBI double in the 8th inning o...       1   
4      @user Alciato: Bee will invest 150 million in ...       2   
...                                                  ...     ...   
45610  @user \""So amazing to have the beautiful Lady...       2   
45611  9 September has arrived, which means Apple's n...       2   
45612  Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...       2   
45613  @user no I'm in hilton head till the 8th lol g...       1   
45614  WASHINGTON (Reuters) - U.S. Vice President Joe...       1   

                                          cleaned_tweets  pos_subreddit  \
0      [qt, user, original, draft, 7th, book, remus, ...              8   
1      [ben, smith, smith, concus

In [15]:
#Extracting positive, neutral, negative adjective features
pos_freq = freq_df[freq_df['Bins']=="pos"].to_dict(orient='list')
neu_freq = freq_df[freq_df['Bins']=="nue"].to_dict(orient='list')
neg_freq = freq_df[freq_df['Bins']=="neg"].to_dict(orient='list')

#print(neg_subreddit)

train_df['pos_freq'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_freq))
train_df['neu_freq'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_freq))
train_df['neg_freq'] = train_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_freq))


val_df['pos_freq'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_freq))
val_df['neu_freq'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_freq))
val_df['neg_freq'] = val_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_freq))

test_df['pos_freq'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_pos(x, lexicon_type = pos_freq))
test_df['neu_freq'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neu(x, lexicon_type = neu_freq))
test_df['neg_freq'] = test_df["cleaned_tweets"].apply(lambda x: FeatureExtractor_neg(x, lexicon_type = neg_freq))

print(train_df)

                                                  Tweets  Labels  \
0      "QT @user In the original draft of the 7th boo...       2   
1      "Ben Smith / Smith (concussion) remains out of...       1   
2      Sorry bout the stream last night I crashed out...       1   
3      Chase Headley's RBI double in the 8th inning o...       1   
4      @user Alciato: Bee will invest 150 million in ...       2   
...                                                  ...     ...   
45610  @user \""So amazing to have the beautiful Lady...       2   
45611  9 September has arrived, which means Apple's n...       2   
45612  Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...       2   
45613  @user no I'm in hilton head till the 8th lol g...       1   
45614  WASHINGTON (Reuters) - U.S. Vice President Joe...       1   

                                          cleaned_tweets  pos_subreddit  \
0      [qt, user, original, draft, 7th, book, remus, ...              8   
1      [ben, smith, smith, concus

In [16]:
#log of word count
def log_wordcount(tweet):
    if len(tweet) > 0:
        word_count = len(tweet)  
    else:
        word_count = 1e-10
    return np.log(word_count)

train_df["log_wordcount"] = train_df["cleaned_tweets"].apply(log_wordcount)
val_df["log_wordcount"] = val_df["cleaned_tweets"].apply(log_wordcount)  
test_df["log_wordcount"] = test_df["cleaned_tweets"].apply(log_wordcount)
  
print(train_df)

                                                  Tweets  Labels  \
0      "QT @user In the original draft of the 7th boo...       2   
1      "Ben Smith / Smith (concussion) remains out of...       1   
2      Sorry bout the stream last night I crashed out...       1   
3      Chase Headley's RBI double in the 8th inning o...       1   
4      @user Alciato: Bee will invest 150 million in ...       2   
...                                                  ...     ...   
45610  @user \""So amazing to have the beautiful Lady...       2   
45611  9 September has arrived, which means Apple's n...       2   
45612  Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...       2   
45613  @user no I'm in hilton head till the 8th lol g...       1   
45614  WASHINGTON (Reuters) - U.S. Vice President Joe...       1   

                                          cleaned_tweets  pos_subreddit  \
0      [qt, user, original, draft, 7th, book, remus, ...              8   
1      [ben, smith, smith, concus

In [17]:
#log of the length of the longest word
def log_length_longestword(tweet):
    max_word_length = 0
    for word in tweet:
        word_length = len(word)
        if word_length > max_word_length:
            max_word_length = word_length
        else:
            max_word_length = 1e-10
    return np.log(max_word_length)

train_df["log_length_longestword"] = train_df["cleaned_tweets"].apply(log_length_longestword)
val_df["log_length_longestword"] = val_df["cleaned_tweets"].apply(log_length_longestword)  
test_df["log_length_longestword"] = test_df["cleaned_tweets"].apply(log_length_longestword)
  
print(train_df)

                                                  Tweets  Labels  \
0      "QT @user In the original draft of the 7th boo...       2   
1      "Ben Smith / Smith (concussion) remains out of...       1   
2      Sorry bout the stream last night I crashed out...       1   
3      Chase Headley's RBI double in the 8th inning o...       1   
4      @user Alciato: Bee will invest 150 million in ...       2   
...                                                  ...     ...   
45610  @user \""So amazing to have the beautiful Lady...       2   
45611  9 September has arrived, which means Apple's n...       2   
45612  Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...       2   
45613  @user no I'm in hilton head till the 8th lol g...       1   
45614  WASHINGTON (Reuters) - U.S. Vice President Joe...       1   

                                          cleaned_tweets  pos_subreddit  \
0      [qt, user, original, draft, 7th, book, remus, ...              8   
1      [ben, smith, smith, concus

In [18]:
#log of count of words with 5 or more characters
def log_count_words_fiveormore(tweet):
    count_long_words = 0
    for word in tweet:
        if len(word) >= 5:
            count_long_words += 1
        else:
            count_long_words = 1e-10
    return np.log(count_long_words)

train_df["log_count_words_fiveormore"] = train_df["cleaned_tweets"].apply(log_count_words_fiveormore)
val_df["log_count_words_fiveormore"] = val_df["cleaned_tweets"].apply(log_count_words_fiveormore)  
test_df["log_count_words_fiveormore"] = test_df["cleaned_tweets"].apply(log_count_words_fiveormore)
  
print(train_df)

                                                  Tweets  Labels  \
0      "QT @user In the original draft of the 7th boo...       2   
1      "Ben Smith / Smith (concussion) remains out of...       1   
2      Sorry bout the stream last night I crashed out...       1   
3      Chase Headley's RBI double in the 8th inning o...       1   
4      @user Alciato: Bee will invest 150 million in ...       2   
...                                                  ...     ...   
45610  @user \""So amazing to have the beautiful Lady...       2   
45611  9 September has arrived, which means Apple's n...       2   
45612  Leeds 1-1 Sheff Wed. Giuseppe Bellusci securin...       2   
45613  @user no I'm in hilton head till the 8th lol g...       1   
45614  WASHINGTON (Reuters) - U.S. Vice President Joe...       1   

                                          cleaned_tweets  pos_subreddit  \
0      [qt, user, original, draft, 7th, book, remus, ...              8   
1      [ben, smith, smith, concus

In [19]:
#Normlaizing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

x_train = train_df.drop(["Tweets","Labels","cleaned_tweets"],axis=1)
y_train = train_df['Labels']
x_train_scaled = scaler.fit_transform(x_train)

x_val = val_df.drop(["Tweets","Labels","cleaned_tweets"],axis=1)
y_val = val_df['Labels']
x_val_scaled = scaler.fit_transform(x_val)

x_test = test_df.drop(["Tweets","Labels","cleaned_tweets"],axis=1)
y_test = test_df['Labels']
x_test_scaled = scaler.fit_transform(x_test)


In [20]:
#Applying Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

LR = LogisticRegression(multi_class = 'multinomial', solver = 'saga')
LR.fit(x_train_scaled, y_train)

y_val_pred = LR.predict(x_val_scaled)
acc = accuracy_score(y_val, y_val_pred)
print("Accuracy of validation dataset: {:.2f}%".format(acc * 100))

f1 = f1_score(y_val, y_val_pred, average = 'weighted')
print("F1 score of validation dataset:{:.2f}".format(f1))


y_test_pred = LR.predict(x_test_scaled)
acc = accuracy_score(y_test, y_test_pred)
print("Accuracy of test dataset: {:.2f}%".format(acc * 100))

f1 = f1_score(y_test, y_test_pred, average = 'weighted')
print("F1 score of test dataset:{:.2f}".format(f1))


Accuracy of validation dataset: 50.85%
F1 score of validation dataset:0.50
Accuracy of test dataset: 50.90%
F1 score of test dataset:0.45
