# **Import necessary libraries**

In [1]:
# Standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from gensim.parsing.preprocessing import remove_stopwords

# nltk for preprocessing of text data
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import TreebankWordTokenizer, SnowballStemmer

# sklearn for preprocessing and machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# **Load data**

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


# **Preprocessing**

In [3]:
# Remove stopwords

def stopword_removal(row):
    message = row['message']
    message = remove_stopwords(message)
    return message

train['message'] = train.apply(stopword_removal, axis=1)
test['message'] = test.apply(stopword_removal, axis=1)

train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's like lack evidence anthropogenic global w...,126103
2,2,RT @RawStory: Researchers years act climate ch...,698562
3,1,#TodayinMaker# WIRED : 2016 pivotal year war c...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, racist, sexist...",466954


**Creating a bag of words and assigning our stemmer and lemmatizer**

In [4]:
# Remove extra white spaces, punctuation and apply lower casing

train['message'] = train['message'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')
test['message'] = test['message'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')
test.head()

Unnamed: 0,message,tweetid
0,europe looking china sure fighting climate cha...,169760
1,combine polling staffers climate change womens...,35326
2,the scary unimpeachable evidence climate chang...,224985
3,karoli morgfair osborneink dailykos putin got...,476263
4,rt fakewillmoore female orgasms cause global w...,872928


# **Split data to X and y variables**

In [5]:
y = train['sentiment']
X = train['message']

# **Turning text into something your model can read**

In [6]:
vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2), min_df=2, sublinear_tf=True)
X_vectorized = vectorizer.fit_transform(X)

# Splitting the training data into a training and validation set

In [7]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.2,shuffle=True, 
                                                stratify=y, random_state=50)

# **Training the model and evaluating using the validation set**

In [8]:
svc = SVC(kernel="linear")
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_val)

In [9]:
f1_score(y_val, svc_pred, average="macro")

0.665689312506114

# **Getting our test set ready**

In [10]:
testx = test['message']
test_vect = vectorizer.transform(testx)

# **Making predictions on the test set and adding a sentiment column to our original test df**

In [11]:
y_pred = svc.predict(test_vect)

In [12]:
test['sentiment'] = y_pred

In [13]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,europe looking china sure fighting climate cha...,169760,1
1,combine polling staffers climate change womens...,35326,1
2,the scary unimpeachable evidence climate chang...,224985,1
3,karoli morgfair osborneink dailykos putin got...,476263,1
4,rt fakewillmoore female orgasms cause global w...,872928,0


# **Creating an output csv for submission**

In [14]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)