# **Import necessary libraries**

In [1]:
# Standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# nltk for preprocessing of text data
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# sklearn for preprocessing and machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



# **Load data**

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

# **Preprocessing**

In [3]:
# Remove punctuation on train and test dataset

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

train["message"] = train["message"].apply(lambda text: remove_punctuation(text))
test["message"] = test["message"].apply(lambda text: remove_punctuation(text))



# **Split data to X and y variables**

In [4]:
y = train['sentiment']
X = train['message']

# **Turning text into something your model can read**

In [5]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

# Splitting the training data into a training and validation set

In [6]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.3,shuffle=True, 
                                                stratify=y, random_state=72)

# **Training the model and evaluating using the validation set**

In [7]:
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_val)

In [8]:
f1_score(y_val, svc_pred, average="macro")

0.626447651884817

# **Getting our test set ready**

In [9]:
testx = test['message']
test_vect = vectorizer.transform(testx)

# **Making predictions on the test set and adding a sentiment column to our original test df**

In [10]:
y_pred = svc.predict(test_vect)

In [11]:
test['sentiment'] = y_pred

In [12]:
test.head(20)

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,The scary unimpeachable evidence that climate ...,224985,1
3,Karoli morgfair OsborneInk dailykos \nPutin go...,476263,1
4,RT FakeWillMoore Female orgasms cause global w...,872928,1
5,RT nycjim Trump muzzles employees of several g...,75639,1
6,bmastenbrook yes wrote that in 3rd yr Comp Sci...,211536,1
7,RT climatehawk1 Indonesian farmers weather cli...,569434,1
8,RT guardian British scientists face a ‘huge hi...,315368,2
9,Aid For Agriculture Sustainable agriculture a...,591733,1


# **Creating an output csv for submission**

In [13]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)