In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, one per line (directories without files print nothing).
all_input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in all_input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# utilities
import re
import string
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

# nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet


# sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets

In [5]:
# Load the three competition CSV files; the paths are relative to the
# notebook's working directory.
train = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/train.csv')
test = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/test.csv')
# NOTE(review): `sample` is not referenced by any later cell visible here —
# presumably kept only as a reference for the submission format.
sample = pd.read_csv('../input/edsa-climate-change-belief-analysis-2021/sample_submission.csv')

In [6]:
# Preview the first rows of the training frame
train.head()

In [7]:
# (rows, columns) of the training frame
train.shape

In [8]:
# Per-tweet text statistics and their mean for each sentiment class.
#
# Each statistic is stored as a new column on `train`; the reporting loop
# below then reads the column and the class that match the printed heading.
# (The hand-copied version printed word_count under the CHARACTER-COUNT
# headings and class 0 under the "Negative" unique-word heading.)
# NOTE(review): only classes 1, 0 and -1 are reported here — confirm against
# train['sentiment'].unique() whether further labels exist.

#1. WORD-COUNT: number of whitespace-separated tokens in the message
train['word_count'] = train['message'].apply(lambda x: len(str(x).split()))

#2. CHARACTER-COUNT: length of the message string
train['char_count'] = train['message'].apply(lambda x: len(str(x)))

#3. UNIQUE WORD-COUNT: number of distinct whitespace-separated tokens
train['unique_word_count'] = train['message'].apply(lambda x: len(set(str(x).split())))

# (heading printed, column holding the statistic)
text_stats = [
    ("WORD-COUNT", 'word_count'),
    ("CHARACTER-COUNT", 'char_count'),
    ("UNIQUE WORD-COUNT", 'unique_word_count'),
]
# (label used in the printed heading, value stored in train['sentiment'])
sentiment_classes = [("positive", 1), ("Neutral", 0), ("Negative", -1)]

for stat_title, stat_column in text_stats:
    for class_label, class_value in sentiment_classes:
        print(f"{stat_title}_of_{class_label}_tweets")
        print(train.loc[train['sentiment'] == class_value, stat_column].mean())

In [9]:
#Plotting word-count per tweet
# One histogram per sentiment class, drawn side by side on a shared figure.
fig,(ax1,ax2,ax3)=plt.subplots(1,3,figsize=(15,8))

# (axis to draw on, sentiment value, bar colour, panel title)
panels = [
    (ax1, 1, 'red', 'Positive tweets'),
    (ax2, 0, 'green', 'Neutral tweets'),
    (ax3, -1, 'blue', 'Negative tweets'),
]
for axis, sentiment_value, bar_color, panel_title in panels:
    train_words = train[train['sentiment'] == sentiment_value]['word_count']
    axis.hist(train_words, color=bar_color)
    axis.set_title(panel_title)

fig.suptitle('Words per tweet')
plt.show()

In [10]:
# Column names, non-null counts and dtypes. The parentheses are required:
# a bare `train.info` only displays the bound method instead of the summary.
train.info()

In [11]:
# dtype of every column in the training frame
train.dtypes

In [12]:
# Distinct class labels present in the target column
train['sentiment'].unique()

In [13]:
# Number of distinct class labels in the target column
train['sentiment'].nunique()

In [14]:
import seaborn as sns
# Bar chart of how many training rows fall into each sentiment class
sns.countplot(x = 'sentiment', data = train)

In [15]:
# Lower-case every message. This overwrites train['message'], so the
# original casing is no longer available to later cells.
train['message'] = train['message'].str.lower()
# Show the last rows as a quick check of the result
train['message'].tail()

In [16]:
# Regex for http:// and https:// URLs: the scheme followed by one or more
# letters, digits, listed punctuation characters or %XX escapes.
# NOTE(review): inside `[$-_@.&+]` the sequence `$-_` is a character range,
# not three literal characters — confirm this is intended.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
# Placeholder token substituted for every matched URL
subs_url = r'url-web'
# regex=True makes Series.replace substitute each match inside the string;
# the result overwrites train['message'].
train['message'] = train['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [17]:
# Cache of the English stop-word list, filled on the first call to stopword().
_ENGLISH_STOPWORDS = None


def stopword(string):
    """Return `string` with English stop words removed.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are re-joined with
    single spaces. Matching is exact (case-sensitive).

    The stop-word list is loaded once and kept as a frozenset: the previous
    version called ``stopwords.words('english')`` again for every token and
    then scanned the returned list linearly.

    Parameters
    ----------
    string : str
        Text to filter.

    Returns
    -------
    str
        The filtered text; empty if no token survives.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loaded lazily so that defining the function never touches the corpus.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return ' '.join(token for token in string.split() if token not in _ENGLISH_STOPWORDS)

In [18]:
#3. LEMMATIZATION
# Lemmatization reduces each word to its base (dictionary) form.

# Single shared lemmatizer instance, used by lemmatizer() below
wl = WordNetLemmatizer()
 
# This is a helper function to map NLTK part-of-speech tags to WordNet POS constants
# Full list is available here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first letter of `tag` decides the result: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unknown tag: treat the word as a noun
    return wordnet.NOUN

# Tokenize the sentence
def lemmatizer(string):
    """Lemmatize every token of `string` and return them joined by spaces.

    The text is tokenized, each token is POS-tagged, the tag is mapped to a
    WordNet part of speech via get_wordnet_pos(), and the shared lemmatizer
    `wl` reduces the token accordingly.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [19]:

#FINAL PREPROCESSING
def finalpreprocess(string):
    """Run the full text clean-up: stop-word removal, then lemmatization."""
    without_stopwords = stopword(string)
    return lemmatizer(without_stopwords)

# Cleaned text column derived from train['message'] via finalpreprocess()
train['clean_message'] = train['message'].apply(finalpreprocess)
# The exploratory statistic columns are no longer needed. errors='ignore'
# keeps this cell re-runnable: on a second run the columns are already gone
# and a plain drop would raise KeyError.
train = train.drop(columns = ['word_count','char_count','unique_word_count'], errors = 'ignore')
train.head()

In [21]:
# Train a Word2Vec model on the cleaned tweets.
# Word2Vec expects one list of tokens per document, so each cleaned message is
# tokenized first; the column therefore holds as many token lists as there are rows.
train['clean_message_tok'] = train['clean_message'].apply(nltk.word_tokenize)

# min_count=1 keeps every word that occurs at least once in the whole corpus;
# a higher value (e.g. 2) would discard words whose total frequency is below it.
model = Word2Vec(sentences=train['clean_message_tok'], min_count=1)

# Lookup table: word -> its learned embedding vector
w2v = {word: vector for word, vector in zip(model.wv.index_to_key, model.wv.vectors)}

#for converting sentence to vectors/numbers from word vectors result by Word2Vec
class MeanEmbeddingVectorizer(object):
    """Turn tokenized documents into vectors by averaging their word embeddings.

    Parameters
    ----------
    word2vec : mapping of str -> 1-D array
        Word-to-embedding lookup. All vectors are expected to share one length,
        which is read from the first entry and stored as ``self.dim``.
        An empty mapping raises StopIteration when that first entry is read.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """No-op: nothing is learned here. Returns self.

        `y` is optional so that both ``fit(X)`` and ``fit(X, y)`` work.
        """
        return self

    def transform(self, X):
        """Return one mean embedding per document.

        Parameters
        ----------
        X : iterable of iterables of str
            Tokenized documents.

        Returns
        -------
        numpy.ndarray
            One row per document. Words missing from the lookup are skipped;
            a document with no known word becomes a zero vector. For an empty
            `X` the result has shape ``(0, self.dim)``, so the output is
            two-dimensional in that case as well.
        """
        zero_vector = np.zeros(self.dim)
        rows = [
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [zero_vector], axis=0)
            for words in X
        ]
        if not rows:
            # np.array([]) would be 1-D; keep the (n_documents, dim) layout
            return np.empty((0, self.dim))
        return np.array(rows)


In [22]:
#SPLITTING THE TRAINING DATASET INTO TRAINING AND VALIDATION

# Input: "clean_message" (the preprocessed tweet text)
# Target: "sentiment"
# random_state fixes the shuffle so that a re-run reproduces the same split
# (and therefore the same scores); stratify keeps the class proportions of
# "sentiment" equal in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(train["clean_message"],
                                                  train["sentiment"],
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  stratify=train["sentiment"],
                                                  random_state=42)
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  #for word2vec
X_val_tok= [nltk.word_tokenize(i) for i in X_val]      #for word2vec

#TF-IDF
# Models need numeric input, so the text is converted to TF-IDF vectors.
# The vectorizer is fitted on the training split only (fit_transform) ...
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) #tfidf runs on non-tokenized sentences unlike word2vec
# ... and merely applied to the validation split (transform). Fitting again on
# validation/test data would change the vocabulary indexes and IDF weights, so
# the features would no longer be compatible with what the model was trained on.
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)


#Word2vec
# Average the word vectors of each document. MeanEmbeddingVectorizer learns
# nothing in fit(), so transform() is called directly.
# NOTE(review): `w2v` comes from an earlier cell; if it was trained on the
# whole `train` frame, the validation tweets contributed to the embeddings.
modelw = MeanEmbeddingVectorizer(w2v)
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_val_tok)


In [23]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)

# L2-penalised logistic regression on the TF-IDF features
# (fit() returns the estimator itself, so construction and training are chained)
lr_tfidf = LogisticRegression(C=10, penalty='l2', solver='liblinear').fit(X_train_vectors_tfidf, y_train)

# Predicted labels for the validation split
y_predict = lr_tfidf.predict(X_val_vectors_tfidf)
# Column 1 of the probability matrix; not used further in this cell
y_prob = lr_tfidf.predict_proba(X_val_vectors_tfidf)[:, 1]

# Per-class precision / recall / F1 on the validation split
print(classification_report(y_true=y_val, y_pred=y_predict))

In [24]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
# Naive Bayes is a probabilistic classifier built on Bayes' Theorem: it treats
# every feature as independent, computes the probability of each category and
# predicts the category with the highest probability.

# Multinomial Naive Bayes on the TF-IDF features
# (fit() returns the estimator itself, so construction and training are chained)
nb_tfidf = MultinomialNB().fit(X_train_vectors_tfidf, y_train)

# Predicted labels for the validation split
y_predict = nb_tfidf.predict(X_val_vectors_tfidf)
# Column 1 of the probability matrix; not used further in this cell
y_prob = nb_tfidf.predict_proba(X_val_vectors_tfidf)[:, 1]

# Per-class metrics and the confusion matrix on the validation split
print(classification_report(y_true=y_val, y_pred=y_predict))
print('Confusion Matrix:', confusion_matrix(y_true=y_val, y_pred=y_predict))

In [25]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression (W2v)
# Same classifier settings as the TF-IDF variant, trained on the averaged
# Word2Vec document vectors instead
# (fit() returns the estimator itself, so construction and training are chained)
lr_w2v = LogisticRegression(C=10, penalty='l2', solver='liblinear').fit(X_train_vectors_w2v, y_train)

# Predicted labels for the validation split
y_predict = lr_w2v.predict(X_val_vectors_w2v)
# Column 1 of the probability matrix; not used further in this cell
y_prob = lr_w2v.predict_proba(X_val_vectors_w2v)[:, 1]

# Per-class metrics and the confusion matrix on the validation split
print(classification_report(y_true=y_val, y_pred=y_predict))
print('Confusion Matrix:', confusion_matrix(y_true=y_val, y_pred=y_predict))

In [27]:
# Predict sentiment for the competition test set with the TF-IDF logistic regression.
# NOTE: `df_test` is the same object as `test` (no copy is made), so the
# columns added below also appear on `test`.
df_test = test

# Apply the same normalisation the training text received before
# finalpreprocess(): lower-casing and URL replacement. Without it the test
# tweets keep capitalised stop words (stop-word matching is case-sensitive)
# and raw URLs, so their features differ from what the model was trained on.
# test['message'] itself is left untouched.
test_messages = (
    test['message']
    .str.lower()
    .replace(to_replace = pattern_url, value = subs_url, regex = True)
)
df_test['clean_message'] = test_messages.apply(finalpreprocess) #preprocess the data
X_test=df_test['clean_message']
X_vector=tfidf_vectorizer.transform(X_test) #transform only: reuse the vocabulary fitted on the training split
y_predict = lr_tfidf.predict(X_vector)      #use the trained model on X_vector
y_prob = lr_tfidf.predict_proba(X_vector)[:,1] #column 1 of the probability matrix; not used further in this cell
df_test['sentiment']= y_predict
print(df_test.head())

In [28]:
# Write the submission file: only the tweet id and the predicted sentiment,
# in that column order, without the DataFrame index.
test.to_csv('testsubmission.csv', columns=['tweetid', 'sentiment'], index=False)