<a href="https://colab.research.google.com/github/MiraBou/TweetsClassification/blob/main/TweetsClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tweets Classification notebook**


# EDA

In [186]:
import pandas as pd
import numpy as np


In [187]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


In [188]:
train.head(5)

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


In [189]:
train.shape

(6525, 3)

In [190]:
train["TweetText"].isnull().sum()

0

In [191]:
train["Label"].value_counts()

Sports      3325
Politics    3200
Name: Label, dtype: int64

In [192]:
y=train["Label"]


In [193]:
train.drop(['TweetId','Label'],axis=1,inplace=True)


In [194]:
train

Unnamed: 0,TweetText
0,'#SecKerry: The value of the @StateDept and @U...
1,'@rraina1481 I fear so'
2,'Watch video highlights of the #wwc13 final be...
3,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,'@cricketfox Always a good thing. Thanks for t...
...,...
6520,'Photo: PM has laid a wreath at Martyrs Monume...
6521,'The secret of the Chennai pitch - crumbling o...
6522,@alinabhutto he isn't on Twitter either
6523,'Which England player would you take out to di...


# Dataset Cleaning

In [195]:
import string 
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.corpus import wordnet

def remove_punctuation(text):
    punctuationfree ="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

wordLemmatizer = nltk.stem.WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
def do_lemmatizing_with_POS(in_str):
    new_str=''
    for word in in_str.split():
        tag=nltk.pos_tag(word)[0][1][0]
        new_str=new_str + wordLemmatizer.lemmatize(word, wordnet_map.get(tag,wordnet.VERB)) + " "
    return new_str



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [196]:
from collections import Counter
def clean_data(new_text):
 #new_data=[]
   new_text = remove_punctuation(new_text)
   new_text = re.sub('https?://\S+|www\.\S+',"",new_text)
   new_text = re.sub(r'\d+',"",new_text)
   new_text = new_text.lower()
   list_text = new_text.split()
   words = stopwords.words('english')
   for word in words: 
     if word in list_text:
      list_text.remove(word)
   new_text=' '.join(list_text)

   return new_text


results = Counter()
d= pd.DataFrame(data=train)
d['countTweet'] = d['TweetText'].str.split().str.len()
d['clean']=d["TweetText"].apply(clean_data)
d['clean'].str.lower().str.split().apply(results.update)
d['count'] = d['clean'].str.split().str.len()
print(len(results.keys()))
#print(results)

16192


### **Lemmatization**

In [197]:
d['lemma']=d["clean"].apply(do_lemmatizing_with_POS)
d.head()

Unnamed: 0,TweetText,countTweet,clean,count,lemma
0,'#SecKerry: The value of the @StateDept and @U...,21,seckerry value the statedept usaid measured do...,13,seckerry value the statedept usaid measured do...
1,'@rraina1481 I fear so',4,rraina fear,2,rraina fear
2,'Watch video highlights of the #wwc13 final be...,14,watch video highlights wwc final australia wes...,9,watch video highlight wwc final australia west...
3,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,12,rt chelscanlan nitro circus at albertpark they...,10,rt chelscanlan nitro circus at albertpark they...
4,'@cricketfox Always a good thing. Thanks for t...,10,cricketfox always good thing thanks feedback,6,cricketfox always good thing thanks feedback


In [198]:
result= Counter()
d['lemma'].str.lower().str.split().apply(result.update)
print(len(result.keys()))

15109


# Modeling

### Vectorization

In [199]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=len(result.keys()))
vector = vectorizer.fit_transform(d["lemma"])
print(vector.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vector, y, test_size=0.33, random_state=42)

(6525, 15087)


### Model

In [200]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [201]:
encoder =LabelEncoder()
y_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.fit_transform(y_test)

In [202]:
class AggregatorModel():
    def __init__(self, models: list = None):
        self.models = models

    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> object:
        return [
            model.fit(x_train, y_train) for model in self.models
        ]

    def predict(self, x_test: np.ndarray) -> np.ndarray:
        return (np.mean([model.predict(x_test) for model in self.models], axis=0) == 1).astype(int)


models=[BernoulliNB(),SVC(),RandomForestClassifier()]
model = AggregatorModel(models)
model.fit(X_train,y_encoded) 

y_train_p=model.predict(X_train)
y_predicted=model.predict(X_test)

print("Model accuracy:", metrics.accuracy_score(y_encoded, y_train_p)*100)
print("Model accuracy:", metrics.accuracy_score(y_test_encoded, y_predicted)*100)

Model accuracy: 99.7483413406543
Model accuracy: 94.01114206128133


# Test Cleaning and Prediction

In [203]:
results_test = Counter()
d_test= pd.DataFrame(data=test)
d_test['countTweet'] = d_test['TweetText'].str.split().str.len()
d_test['clean']=d_test["TweetText"].apply(clean_data)
d_test['clean'].str.lower().str.split().apply(results_test.update)
d_test['count'] = d_test['clean'].str.split().str.len()
print(len(results_test.keys()))

8478


In [182]:
d_test['lemma']=d_test["clean"].apply(do_lemmatizing_with_POS)
d_test.head()
result_test= Counter()
d_test['lemma'].str.lower().str.split().apply(result_test.update)
print(len(result_test.keys()))

7943


In [183]:
vector_test = vectorizer.transform(test["lemma"])
print(vector_test.shape)

(2610, 15087)


In [184]:
def to_classes(label):
    print(label)
    if label == 0:
        return "Politics" 
    return "Sports"

In [None]:
test['Label']=[to_classes(label) for label in model.predict(vector_test)]
test.drop(['clean','count','countTweet','lemma'],axis=1,inplace=True)
test.to_csv('result.csv',index = False)