## Classification Predict 

#### Import all necessary packages:

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### Load data from kaggle:

In [2]:
train = pd.read_csv('https://raw.githubusercontent.com/Iamsim21/Classification_Predict/main/train.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Iamsim21/Classification_Predict/main/test.csv')

In [3]:
train.head(5)

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [4]:
test.head(5)

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [5]:
from sklearn.utils import resample 

yes = train[train['sentiment'] == 1]
no = train[train['sentiment'] == -1]
neutral = train[train['sentiment'] == 0]
news = train[train['sentiment'] == 2]

no_unsampled = resample(no, replace=True, n_samples=len(yes), random_state=27)
ne_unsampled = resample(neutral, replace=True, n_samples=len(yes), random_state=27)
nw_unsampled = resample(news, replace=True, n_samples=len(yes), random_state=27)

unsampled = pd.concat([yes, no_unsampled, ne_unsampled, nw_unsampled])

In [6]:
unsampled.head(5)

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
5,1,Worth a read whether you do or don't believe i...,425577


#### Finding target variables:

In [7]:
y = unsampled['sentiment']
X = unsampled['message']

#### Turning text into something readable for the model:

In [8]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),min_df=2, stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

#### Splitting training data into a training and validation set:

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.25, shuffle=True, random_state=11)

#### Train the model and evaluate it using the validation set:

In [10]:
# Random Forest Classifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

#### Check performance of model on the validation set:

In [11]:
f1_score(y_test, rfc_pred, average='macro')

0.9553488130032157

#### Get test set ready:

In [12]:
x_test = test['message']
v_test = vectorizer.transform(x_test)

#### Make prediction on the test set and add a sentiment column to our orignal test dataframe:

In [13]:
y_pred = rfc.predict(v_test)

In [14]:
test['sentiment'] = y_pred

In [15]:
test.head(5)

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


Create an output csv submission:

In [16]:
test[['tweetid', 'sentiment']].to_csv('testsubmission.csv', index=False)