# **Import necessary libraries**

In [1]:
# Standard packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# nltk for preprocessing of text data
import nltk
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# sklearn for preprocessing and machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



# **Load data**

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

# **Split data to X and y variables**

In [3]:
y = train['sentiment']
X = train['message']

# **Turning text into something your model can read**

In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

# Splitting the training data into a training and validation set

In [5]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.3,shuffle=True, 
                                                stratify=y, random_state=50)

# **Training the model and evaluating using the validation set**

In [6]:
svc = SVC(kernel='linear')
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_val)

In [7]:
f1_score(y_val, svc_pred, average="macro")

0.6547616564927567

# **Getting our test set ready**

In [8]:
testx = test['message']
test_vect = vectorizer.transform(testx)

# **Making predictions on the test set and adding a sentiment column to our original test df**

In [9]:
y_pred = svc.predict(test_vect)

In [10]:
test['sentiment'] = y_pred

In [11]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


# **Creating an output csv for submission**

In [12]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)