## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from kaggle.  

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

First we need to check how many values of each sentiment we have to see if the data is balanced.

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

Let's see how our data is structured.

In [4]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


# Balance data

In [5]:
from sklearn.utils import resample
believe = train[train['sentiment'] == 1]
no_belief = train[train['sentiment']== -1]
neutral = train[train['sentiment']== 0]
news = train[train['sentiment']== 2]

In [6]:
#Upsample minority
no_belief_upsampled = resample(no_belief, replace=True, n_samples = len(believe), random_state=27)
neutral_upsampled = resample(neutral, replace=True, n_samples = len(believe), random_state=27)
news_upsampled = resample(news, replace=True, n_samples = len(believe), random_state=27)

In [7]:
#Combine majority and upsampled minority
upsampled =pd.concat([believe, no_belief_upsampled, neutral_upsampled, news_upsampled])

In [8]:
upsampled.sentiment.value_counts()

-1    8530
 2    8530
 1    8530
 0    8530
Name: sentiment, dtype: int64

# Data Cleaning

In [9]:
#Remove all websites and replce them with the text 'web_url'
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
upsampled['message'] = upsampled['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [10]:
#make all lower case
upsampled['message'] = upsampled['message'].str.lower()

In [11]:
#Remove punctuation and numbers
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])
upsampled['message'] = upsampled['message'].apply(remove_punctuation_numbers)

In [12]:
upsampled.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
3,1,todayinmaker wired was a pivotal year in the...,573736
4,1,rt soynoviodetodas its and a racist sexist cl...,466954
5,1,worth a read whether you do or dont believe in...,425577


That looks much better. Now to begin preparing our data for our model.

## Splitting out the X variable from the target

In [13]:
y = upsampled['sentiment']
X = upsampled['message']

Let's create a stemmer and tokenizer to make the text more readable for our model.

In [14]:
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer=PorterStemmer()

def text_preprocessor(text):
    
    text=text.lower() 
    text=re.sub("\\W"," ",text) # remove special chars
    text=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",text) # normalize certain words
    
    # stem words
    words=re.split("\\s+",text)
    stemmed_words=[porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

In [15]:
def my_tokenizer(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)

    # split based on whitespace
    return re.split("\\s+",text)

## Turning text into something your model can read

In [16]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=my_tokenizer, min_df=2,max_df=0.80,analyzer='word',smooth_idf=False, preprocessor=text_preprocessor,stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

  'stop_words.' % sorted(inconsistent))


## Splitting the training data into a training and validation set

In [17]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.30,shuffle=True, random_state=25)

## Training the model and evaluating using the validation set 

In [18]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_val)

## Checking the performance of our model on the validation set

In [19]:
f1_score(y_val, lsvc_pred, average="macro")

0.9506267180466148

In [20]:
from sklearn import metrics

print(metrics.classification_report(y_val, lsvc_pred))

              precision    recall  f1-score   support

          -1       0.98      0.99      0.98      2552
           0       0.95      0.96      0.96      2601
           1       0.94      0.89      0.91      2553
           2       0.93      0.97      0.95      2530

    accuracy                           0.95     10236
   macro avg       0.95      0.95      0.95     10236
weighted avg       0.95      0.95      0.95     10236



## Getting our test set ready 

In [21]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [22]:
y_pred = lsvc.predict(test_vect)

In [23]:
test['sentiment'] = y_pred

In [24]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [25]:
test[['tweetid','sentiment']].to_csv('testsubmission_8.csv', index=False)