In [1]:
# Import all necessary packages
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load data:
train = pd.read_csv('https://raw.githubusercontent.com/Iamsim21/Classification_Predict/main/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Iamsim21/Classification_Predict/main/test.csv')

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
train['message'] = train['message'].apply(word_tokenize)

test['message'] = test['message'].apply(word_tokenize)

In [5]:
from nltk.tokenize import RegexpTokenizer

In [6]:
train['message'] = train['message'].astype(str)
test['message'] = test['message'].astype(str)

In [7]:
train['message'] = train['message'].apply(lambda x: RegexpTokenizer("[\w+.]+").tokenize(x))
test['message'] = test['message'].apply(lambda x: RegexpTokenizer("[\w+.]+").tokenize(x))

In [8]:
message_train = train['message']
message_train

0        [PolySciMajor, EPA, chief, does, n, t, think, ...
1        [It, s, not, like, we, lack, evidence, of, ant...
2        [RT, RawStory, Researchers, say, we, have, thr...
3        [TodayinMaker, WIRED, 2016, was, a, pivotal, y...
4        [RT, SoyNovioDeTodas, It, s, 2016, and, a, rac...
                               ...                        
15814    [RT, ezlusztig, They, took, down, the, materia...
15815    [RT, washingtonpost, How, climate, change, cou...
15816    [notiven, RT, nytimesworld, What, does, Trump,...
15817    [RT, sara8smiles, Hey, liberals, the, climate,...
15818    [RT, Chet_Cannon, ., kurteichenwald, s, climat...
Name: message, Length: 15819, dtype: object

In [9]:
message_test = test['message']
message_test

0        [Europe, will, now, be, looking, to, China, to...
1        [Combine, this, with, the, polling, of, staffe...
2        [The, scary, unimpeachable, evidence, that, cl...
3        [Karoli, morgfair, OsborneInk, dailykos, Putin...
4        [RT, FakeWillMoore, Female, orgasms, cause, gl...
                               ...                        
10541    [RT, BrittanyBohrer, Brb, writing, a, poem, ab...
10542    [2016, the, year, climate, change, came, home,...
10543    [RT, loop_vanuatu, Pacific, countries, positiv...
10544    [RT, xanria_00018, You, re, so, hot, you, must...
10545    [RT, chloebalaoing, climate, change, is, a, gl...
Name: message, Length: 10546, dtype: object

In [10]:
mtr = [' '.join(i) for i in message_train]
mts = [' '.join(i) for i in message_test]

In [11]:
train['message'] = pd.DataFrame(mtr, columns = ['message'])
test['message'] = pd.DataFrame(mts, columns = ['message'])

In [12]:
# Resample data
from sklearn.utils import resample 

yes = train[train['sentiment'] == 1]
no = train[train['sentiment'] == -1]
neutral = train[train['sentiment'] == 0]
news = train[train['sentiment'] == 2]

no_unsampled = resample(no, replace=True, n_samples=len(yes), random_state=27)
ne_unsampled = resample(neutral, replace=True, n_samples=len(yes), random_state=27)
nw_unsampled = resample(news, replace=True, n_samples=len(yes), random_state=27)

unsampled = pd.concat([yes, no_unsampled, ne_unsampled, nw_unsampled])

In [13]:
unsampled.head(5)

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief does n t think carbon d...,625221
1,1,It s not like we lack evidence of anthropogeni...,126103
3,1,TodayinMaker WIRED 2016 was a pivotal year in ...,573736
4,1,RT SoyNovioDeTodas It s 2016 and a racist sexi...,466954
5,1,Worth a read whether you do or do n t believe ...,425577


In [14]:
# Apply variables
y = unsampled['sentiment']
X = unsampled['message']

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),min_df=2, stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [16]:
# Split train dataset into test and validation set: 
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, shuffle=True, random_state=47)

In [32]:
# Load logistic regression model
logreg = LogisticRegression(max_iter=1500, C=10)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

In [33]:
# Check f1-score:
f1_score(y_test, logreg_pred, average='macro')

0.9601487780749538

In [34]:
# Prepare test set
x_test = test['message']
v_test = vectorizer.transform(x_test)

#### Make prediction on the test set and add a sentiment column to our orignal test dataframe:

In [35]:
# Make a prediction on the test set and add a sentiment column 
y_pred = logreg.predict(v_test)

In [36]:
test['sentiment'] = y_pred

In [37]:
test.head(5)

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,The scary unimpeachable evidence that climate ...,224985,1
3,Karoli morgfair OsborneInk dailykos Putin got ...,476263,1
4,RT FakeWillMoore Female orgasms cause global w...,872928,1


Create an output csv submission:

In [38]:
test[['tweetid', 'sentiment']].to_csv('test_sub3.csv', index=False)