In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno # for missing data visualization
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for data visualization
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
import re
import emoji
import time


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **1. Visualizing Data**

In [2]:
# Load the training tweets from CSV and preview 8 randomly sampled rows.
train =  pd.read_csv("/kaggle/input/tweets/train.csv")
train.sample(8)

In [3]:
# Load the test tweets from CSV and preview 8 randomly sampled rows.
test =  pd.read_csv("/kaggle/input/tweets/test.csv")
test.sample(8)

In [4]:
print('Information of the training set')
# Show the information of the training dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would emit a stray "None" line).
train.info()

In [5]:
print('Information of the testing set')
# Show the information of the test dataset.
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (which would emit a stray "None" line).
test.info()

In [6]:
# Look for missing values in the training dataset with a missingno matrix plot
msno.matrix(df=train, figsize=(6,6), color=(0,0,0))

No missing values in the training dataset

In [7]:
# Number of training tweets per class label
train['Label'].value_counts()

It'll be better if we could get a relative percentage instead of the count. It is very simple with value_counts and can be achieved with a minor modification in the above code. 

In [8]:
# Class distribution as percentages (normalize=True yields fractions, scaled here by 100)
train['Label'].value_counts(normalize=True)*100

In [9]:
# Visualizing target: bar chart of the number of tweets per label
fig, ax = plt.subplots(figsize=(9, 6))
sns.countplot(x='Label', data=train, ax=ax)

This is a better representation. We have almost the same number of tweets in "Politics" and "Sports" classes.

# **2. Preprocessing**

Before we start with any NLP project, we need to pre-process the data to get it all in a consistent format. We need to clean, tokenize and convert our data into a matrix. Let's create functions which will perform the following tasks on the TweetText column:

* Cleans tweets
* Tokenizes
* Removes stopwords



**Cleaning tweets**

In [10]:
# Remove links, mentions, digits, punctuation/emoji and retweet markers.
def clean_text(tweet):
    """Normalise a raw tweet into lowercase, single-space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. drop the HTML entity ``&amp;`` (done *before* punctuation is stripped,
         so ordinary words ending in "amp" such as "camp" are left intact);
      3. drop ASCII digits;
      4. drop URLs (``http://``, ``https://``, ``www.``) and ``@mentions``;
      5. drop every character that is neither a word character nor whitespace
         (this removes punctuation, the ``#`` sign and emoji alike, so no
         dependency on the ``emoji`` package is needed);
      6. turn underscores into spaces (``#go_team`` -> ``go team``);
      7. drop a leading old-style retweet marker ``rt``;
      8. collapse runs of whitespace.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN) are treated as
            an empty tweet.

    Returns:
        str: The cleaned tweet, possibly empty.
    """
    if not isinstance(tweet, str):
        return ""
    tweet = tweet.lower()
    tweet = re.sub(r"&amp;?", " ", tweet)  # HTML-escaped ampersand
    tweet = re.sub(r"[0-9]+", "", tweet)  # numerics
    tweet = re.sub(r"(?:@|https?://|www\.)\S+", "", tweet)  # links and mentions
    tweet = re.sub(r"[^\w\s]", "", tweet)  # punctuation, '#', emoji
    tweet = tweet.replace("_", " ")  # keep hashtag text, split snake_case tags
    tweet = re.sub(r"^\s*rt\s+", "", tweet)  # old style retweet text "RT"
    return " ".join(tweet.split())

clean_text(tweet) function applies a first round of text cleaning techniques.

In [11]:
# Clean every training tweet (overwrites the TweetText column), then preview.
train["TweetText"] = train["TweetText"].apply(clean_text)
train.head()

In [12]:
# Clean every test tweet (overwrites the TweetText column), then preview.
test["TweetText"] = test["TweetText"].apply(clean_text)
test.head()

**Tokenization**

In [13]:
# Split each cleaned training tweet into a list of word tokens.
train["TweetText"] = train["TweetText"].apply(word_tokenize)
train.head()

In [14]:
# Split each cleaned test tweet into a list of word tokens.
test["TweetText"] = test["TweetText"].apply(word_tokenize)
test.head()

**Remove Stop Words**

In [15]:
# English stop words are loaded from NLTK once and kept here, instead of
# being re-read from the corpus for every single tweet.
_STOPWORDS_CACHE = {}


def remove_stopwords(tweet, stop_words=None):
    """Return the tokens of ``tweet`` that are not stop words.

    Args:
        tweet: Iterable of word tokens.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used (loaded on first use and
            cached as a frozenset for fast membership tests).

    Returns:
        list: The remaining tokens, in their original order.
    """
    if stop_words is None:
        if "english" not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE["english"] = frozenset(
                nltk.corpus.stopwords.words('english')
            )
        stop_words = _STOPWORDS_CACHE["english"]
    return [word for word in tweet if word not in stop_words]

The function remove_stopwords(tweet) then takes in the tokenized text and removes stop words.

In [16]:
# Drop stop words from every tokenised training tweet, then preview.
train["TweetText"] = train["TweetText"].apply(remove_stopwords)
train.head()

In [17]:
# Drop stop words from every tokenised test tweet, then preview.
test["TweetText"] = test["TweetText"].apply(remove_stopwords)
test.head()

**Lemmatization**

In [18]:
def lemmatization(tweet):
    """Return the tokens of ``tweet`` lemmatized with NLTK's WordNetLemmatizer.

    A new lemmatizer is created on each call; token order is preserved.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tweet))

In [19]:
# Lemmatize the tokens of every training tweet, then preview.
train["TweetText"] = train["TweetText"].apply(lemmatization)
train.head()

In [20]:
# Lemmatize the tokens of every test tweet, then preview.
test["TweetText"] = test["TweetText"].apply(lemmatization)
test.head()

Tweets after preprocessing

In [21]:
# First rows of the training frame; TweetText now holds the preprocessed token lists
train.head()

In [22]:
 # First rows of the test frame; TweetText now holds the preprocessed token lists
 test.head()

# **3. Feature Extraction**

**Bag of Words**

In [23]:
# Creating the Bag of Words model
def bag_of_words(tweet_text):
    """Count how often each word occurs across all tweets.

    The tweets are iterated directly rather than via ``range(len(...))`` and
    label lookup, so a pandas Series whose index is not 0..n-1 (e.g. after
    filtering or shuffling) is handled correctly, as is a plain list.

    Args:
        tweet_text: Iterable of tokenised tweets (each an iterable of words).

    Returns:
        dict: word -> total number of occurrences, in first-seen order.
    """
    BoW = {}
    for tokens in tweet_text:
        for word in tokens:
            BoW[word] = BoW.get(word, 0) + 1
    return BoW

In [24]:
# Word -> occurrence count over all preprocessed training tweets
train_BoW = bag_of_words(train["TweetText"])

In [25]:
# Word -> occurrence count over all preprocessed test tweets
test_BoW = bag_of_words(test["TweetText"])

In [26]:
def words_tab(df, keys):
    """Build a binary word-presence table for a collection of tokenised tweets.

    The matrix is filled in a NumPy array and wrapped in a DataFrame once.
    This avoids chained assignment (``frame[col][row] = 1``), which does not
    write through to the frame when pandas Copy-on-Write is active, and it
    addresses rows by position so the input's index labels do not matter.

    Args:
        df: Iterable (typically a pandas Series) of tokenised tweets.
        keys: Iterable of vocabulary words; one output column per word, in
            the given order.

    Returns:
        pandas.DataFrame: float matrix of shape (len(df), len(keys)) with a
        default 0..n-1 index; cell (i, w) is 1.0 when word ``w`` occurs in the
        i-th tweet, else 0.0. Words not present in ``keys`` are ignored.
    """
    columns = list(keys)
    column_position = {word: pos for pos, word in enumerate(columns)}
    matrix = np.zeros((len(df), len(columns)))
    for row, tokens in enumerate(df):
        for word in tokens:
            pos = column_position.get(word)
            if pos is not None:
                matrix[row, pos] = 1
    return pd.DataFrame(data=matrix, columns=columns)

In [27]:
# One row per training tweet, one column per word of the training vocabulary
train_tab = words_tab(train['TweetText'],train_BoW.keys())

In [28]:
# One row per test tweet, one column per word of the test vocabulary
test_tab = words_tab(test['TweetText'],test_BoW.keys())

In [29]:
# Give both tables the same columns IN THE SAME ORDER.
# Appending the missing words to each table separately leaves train and test
# with identical column sets but different column orders, so column k of the
# training matrix and column k of the test matrix would describe different
# words once converted to arrays. Build one shared vocabulary instead
# (training words first, then words seen only in the test data) and reindex
# both tables to it; words absent from a table become all-zero columns.
train_words = set(train_tab.columns)
vocabulary = list(train_tab.columns) + [
    word for word in test_tab.columns if word not in train_words
]
train_tab = train_tab.reindex(columns=vocabulary, fill_value=0)
test_tab = test_tab.reindex(columns=vocabulary, fill_value=0)

In [30]:
# Number of columns (vocabulary size) of each table, measured on the first row
print(len(train_tab.iloc[0]))
print(len(test_tab.iloc[0]))

**TFIDF**

In [31]:
# Fit the TF-IDF weighting on the training word table and densify the result.
tfidfconverter = TfidfTransformer()
train_tfidf_sparse = tfidfconverter.fit_transform(train_tab)
X = train_tfidf_sparse.toarray()
X

In [32]:
# Weight the test word table with the transformer already fitted on the
# training table: IDF values must come from the training data only, so the
# test data is transformed, never re-fitted. Columns are selected in the
# training table's order so every feature position means the same word as in X.
# NOTE: this rebinds `test` from the DataFrame to a NumPy feature array.
test = tfidfconverter.transform(test_tab[train_tab.columns]).toarray()
test

In [33]:
def text_to_plot(tweets):
    """Flatten tokenised tweets into one string for plotting.

    Each tweet contributes a single space followed by its tokens joined with
    spaces, so a non-empty result always starts with a space.
    """
    return "".join(" " + " ".join(tokens) for tokens in tweets)

In [34]:
# Token lists of the 'Politics' tweets, flattened into one string
is_politics = train['Label'] == 'Politics'
politics = train.loc[is_politics, 'TweetText']
politics_text = text_to_plot(politics)

In [35]:
# Token lists of the 'Sports' tweets, flattened into one string
is_sports = train['Label'] == 'Sports'
sports = train.loc[is_sports, 'TweetText']
sports_text = text_to_plot(sports)

In [36]:
# Side-by-side word clouds of the words used in each class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 15])
panels = [
    (ax1, politics_text, 'Politics Tweets'),
    (ax2, sports_text, 'Sports Tweets'),
]
for panel_ax, panel_text, panel_title in panels:
    # random_state pins the word layout so the figure is reproducible
    # across "Restart & Run All".
    cloud = WordCloud(background_color='white',
                      width=600,
                      height=400,
                      random_state=42).generate(panel_text)
    panel_ax.imshow(cloud)
    panel_ax.axis('off')
    panel_ax.set_title(panel_title, fontsize=30)

# **3. Model Training**

In [37]:
# Target vector: select the class label by column name, as the rest of the
# notebook does, instead of relying on it being the second CSV column.
y = train['Label'].values
# Hold out 20% of the training tweets for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Random Forest**

In [38]:
# Random forest of 20 entropy-criterion trees, fitted on the training split.
RF_classifier = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    random_state=100,
).fit(X_train, y_train)
RF_classifier

**Naive Bayes**

In [39]:
# Multinomial Naive Bayes with default settings, fitted on the training split.
NB_classifier = MultinomialNB().fit(X_train, y_train)
NB_classifier

# **4. Performance**

In [40]:
# Random Forest predictions for the held-out validation split
predictions = RF_classifier.predict(X_test)

In [41]:
# Accuracy of the Random Forest on the training split and on the held-out split.
rf_train_accuracy = RF_classifier.score(X_train, y_train)
accuracy_tfidf = accuracy_score(y_test, predictions)
print('Training Accuracy of Random Forest: %.3f' % rf_train_accuracy)
print('Test Accuracy of Random Forest: %.3f' % accuracy_tfidf)

In [42]:
target_names = ['Politics','Sports']

def show_confusion_matrix(validations, predictions, titre):
    """Plot a confusion matrix as an annotated heatmap.

    Args:
        validations: True labels.
        predictions: Predicted labels, aligned with ``validations``.
        titre: Title of the figure.

    The matrix is computed with ``labels=target_names`` so that its rows and
    columns always follow the same order as the tick labels, even when one of
    the classes is missing from ``validations``/``predictions``.
    """
    matrix = confusion_matrix(validations, predictions, labels=target_names)
    plt.figure(figsize=(8, 8), dpi=50)
    # For label size. NOTE: sns.set changes seaborn's global theme, so it
    # also affects figures drawn after this call.
    sns.set(font_scale=1.6)
    sns.heatmap(matrix,
                linecolor='white',
                linewidths=1,
                xticklabels=target_names,
                yticklabels=target_names,
                annot=True,
                fmt="d")
    plt.title(titre)
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()

In [43]:
# Random Forest on the held-out split: confusion-matrix plot, classification report, accuracy
show_confusion_matrix(y_test,predictions, "Random Forest Confusion Matrix")
print(classification_report(y_test,predictions))  
print(accuracy_score(y_test, predictions))

In [44]:
# Naive Bayes predictions for the same held-out split (rebinds `predictions`)
predictions = NB_classifier.predict(X_test)

In [45]:
# Accuracy of Naive Bayes on the training split and on the held-out split.
nb_train_accuracy = NB_classifier.score(X_train, y_train)
accuracy_tfidf = accuracy_score(y_test, predictions)
print('Training Accuracy of Naive Bayes: %.3f' % nb_train_accuracy)
print('Test Accuracy of Naive Bayes: %.3f' % accuracy_tfidf)

In [46]:
# Naive Bayes on the held-out split: confusion-matrix plot, classification report, accuracy
show_confusion_matrix(y_test,predictions, "Naive Bayes Confusion Matrix")
print(classification_report(y_test,predictions))  
print(accuracy_score(y_test, predictions))

# **5. Predicting**

In [47]:
# Reload the raw test tweets: the name `test` was rebound to the TF-IDF feature array earlier in the notebook
test_data = pd.read_csv("/kaggle/input/tweets/test.csv")

In [48]:
# Attach each classifier's predicted label to the raw test tweets and show 10 random rows.
test_data['Label with RF'] = list(RF_classifier.predict(test))
test_data['Label with NB'] = list(NB_classifier.predict(test))
test_data.sample(10)