In [1]:
import pandas as pd 
import numpy as np
import re

# Visualisation libraries
import seaborn as sns

import missingno as msno #For missing value visualization
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

# From here: https://www.tutorialspoint.com/plotly/plotly_plotting_inline_with_jupyter_notebook.htm
import plotly.offline as py
py.init_notebook_mode(connected=True)

# string utility
import string

# main NLP library and model
import spacy
# Load the small English pipeline. It is downloaded only when it is not installed
# yet, so re-running the notebook does not hit the network every time.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

import time

# Calculation of Performance of Models
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score


# Modelling Purpose
# Regression model
from sklearn.linear_model import LogisticRegression

# Single decision tree (not an ensemble by itself)
from sklearn.tree import DecisionTreeClassifier

# Essentially, Random Forest is a group of decision trees
from sklearn.ensemble import RandomForestClassifier

# 

# Support Vector Classifier, based on SVMs (Support Vector Machines)
from sklearn.svm import SVC

# https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import ExtraTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# Baseline classifier that ignores the input features (sanity-check reference)
from sklearn.dummy import DummyClassifier

## K-nearest neighbor classifier
from sklearn.neighbors import KNeighborsClassifier

## SGDClassifier
from sklearn.linear_model import SGDClassifier

In [3]:
## data Exploration
# Earlier per-file loads, kept for reference (disabled):
#ibrahim_data = pd.read_csv(r"data\ibrahim.csv", sep=";")
#florina_data = pd.read_csv(r"data\florina.csv", sep=";")
#mohamed_data = pd.read_csv(r"data\mohamed.csv", sep=";")

# Load the semicolon-separated dataset and keep only the text and label columns.
# NOTE(review): the recorded run of this cell failed with
# "ParserError: ... Expected 3 fields in line 4, saw 10", i.e. at least one row holds
# more ';' separators than the header - presumably unquoted ';' inside the news text.
# TODO confirm and fix the file quoting (or the read_csv arguments); every cell below
# needs `dataset`.
dataset = pd.read_csv(r"data\ready_data.csv", sep=";")[["news", "label"]]

ParserError: Error tokenizing data. C error: Expected 3 fields in line 4, saw 10


In [None]:
# Overview of the loaded frame: column dtypes and non-null counts.
dataset.info()

In [None]:
# Peek at the first rows to check that the columns were parsed as expected.
dataset.head()

In [None]:
## data cleaning
# Keep only the text ("news") and target ("label") columns.
# NOTE(review): the load cell already selects these two columns, so this is a
# redundant safeguard.
dataset = dataset[["news", "label"]]
dataset.head()

In [None]:
## null values
# Visualise where values are missing in each column of the frame.
msno.matrix(dataset)

In [None]:
## drop rows with null label
# A row without a label cannot be used for supervised training.
# .copy() makes the filtered frame independent of the frame it was sliced from, so
# the later column assignment (dataset['news'] = ...) does not raise
# SettingWithCopyWarning.
dataset = dataset[dataset['label'].notna()].copy()

In [None]:
# Re-check row count and non-null counts after dropping the unlabeled rows.
dataset.info()

In [None]:
# Normalise one raw news text into a string of lemmas
def clean(text):
    """Lemmatise `text` and drop stop words and non-alphabetic tokens.

    Parameters
    ----------
    text : str or missing value
        Raw news text. Anything that is not a string (None, NaN, numbers) is
        treated as missing.

    Returns
    -------
    str or None
        The kept lemmas joined by single spaces, or None when the input is
        missing/blank or no token survives the filter. None lets the caller
        remove such rows with a plain dropna().
    """
    # Only null labels are dropped before this runs, so the text column can still
    # hold NaN/None; spaCy would raise on those. Blank strings yield no tokens
    # anyway, so skip the pipeline for them as well.
    if not isinstance(text, str) or not text.strip():
        return None

    # tokenize and analyze text
    doc = nlp(text)

    # keep the lemma of every token that is alphabetic and not a stop word
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens) if tokens else None

In [None]:
# Replace every raw text by its cleaned form; clean() runs the spaCy pipeline once
# per row and returns None for texts that end up without any token.
dataset['news'] = dataset['news'].apply(clean)

In [None]:
# Inspect a few cleaned texts.
dataset.head()

In [None]:
# Bar chart of how many texts are missing (True) vs. present (False) after cleaning.
dataset["news"].isna().value_counts().plot(kind="bar")

In [None]:
# Remove every row that still contains a missing value (texts that cleaned to nothing).
dataset = dataset.dropna()

In [None]:
# Same chart again as a check: after the dropna only the False bar should remain.
dataset["news"].isna().value_counts().plot(kind="bar")

In [None]:
# store cleaned data
# Raw string: "\c" is not a valid escape sequence, so the non-raw literal only kept
# the backslash by accident and warns on newer Python versions. The path value is
# unchanged.
dataset.to_csv(r"data\cleaned_data.csv", index=False)

In [None]:
# load cleaned data
# Raw string: "\c" is not a valid escape sequence, so the non-raw literal only kept
# the backslash by accident and warns on newer Python versions. The path value is
# unchanged.
data = pd.read_csv(r"data\cleaned_data.csv")
# Prints the number of missing values per column.
# NOTE(review): should be 0 given the dropna before saving - verify, since read_csv
# may parse some texts (e.g. "nan", "null") back as missing.
print(f"Rows with empy columns: {data.isna().sum()}")

In [None]:
# Class distribution: number of rows per label.
data['label'].value_counts()

In [None]:
# Plot the class distribution from the actual counts instead of hard-coded numbers,
# so the chart cannot drift from the data when the dataset changes.
label_counts = data['label'].value_counts()
genre_df = pd.DataFrame({'label': label_counts.index, 'count': label_counts.values})
fig = px.histogram(genre_df, x='label', y='count', color='label')
fig.show()

In [None]:
def print_word_cloud(label,news):
    """Print the class label, then render a word cloud of its text.

    label: value printed above the figure.
    news:  single string whose words are drawn (passed to WordCloud.generate).
    """
    print(label)

    # 400x400 cloud on a white canvas with a minimum font size of 10.
    cloud = WordCloud(
        width=400,
        height=400,
        background_color='white',
        min_font_size=10,
    )
    image = cloud.generate(news)

    # Show the rendered cloud as an image, without axes or padding.
    plt.figure(figsize=(7, 7), facecolor='white', edgecolor='blue')
    plt.imshow(image)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

def make_string(label):
    """Concatenate all news texts of one class into a single string.

    Reads the module-level frame `data` (columns 'label' and 'news').

    label: class value to select rows by.
    Returns the selected texts, each preceded by one space (so a non-empty result
    starts with a space); returns "" when no row has that label.
    """
    # Select the column directly and join once; this avoids the slow row-by-row
    # iterrows() loop and the repeated string re-allocation of `+=`.
    texts = data.loc[data['label'] == label, 'news']
    return "".join(" " + text for text in texts)

In [None]:
# Draw one word cloud per distinct class label.
labels = data["label"].unique().tolist()
for label_name in labels:
    news_string = make_string(label_name)
    print_word_cloud(label_name, news_string)

In [None]:
## modelling

# Encode the class labels as integers. The fitted encoder is kept (instead of being
# thrown away) so predicted ids can be mapped back to the original labels with
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])
data.head()

In [None]:
# Bag-of-words features: one column per vocabulary word, values are word counts.
# The fitted vectorizer is kept so unseen text can later be transformed with the
# same vocabulary.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split, so
# words that only occur in the test rows still shape the feature space; consider
# fitting on the training texts only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['news'])
y = data['label']

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split (and
# therefore every score below) reproducible; stratify=y keeps the class proportions
# the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Candidate classifiers, all with default hyper-parameters. Estimators that draw
# random numbers while fitting get a fixed random_state so repeated runs give the
# same scores.
models = [
    MultinomialNB(),
    LogisticRegression(),
    RandomForestClassifier(random_state=42),
    SVC(),
    DummyClassifier(),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
    SGDClassifier(random_state=42),
]

In [None]:
## without using onevsrest
# Fit every candidate on the same split and collect accuracy, macro-averaged
# precision/recall/F1 and the wall-clock time of fit + predict.
Name = []
Accuracy = []
Precision = []
F1_Score = []
Recall = []
Time_Taken = []
for model in models:
    name = type(model).__name__
    Name.append(name)
    begin = time.time()
    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    end = time.time()
    # scikit-learn metrics take (y_true, y_pred). With the arguments the other way
    # round, precision and recall are swapped.
    # zero_division=0 scores a class that is never predicted (e.g. by the dummy
    # baseline) as 0 without emitting a warning for it.
    Accuracy.append(accuracy_score(y_test, prediction))
    Precision.append(precision_score(y_test, prediction, average='macro', zero_division=0))
    Recall.append(recall_score(y_test, prediction, average='macro', zero_division=0))
    F1_Score.append(f1_score(y_test, prediction, average='macro', zero_division=0))
    Time_Taken.append(end-begin)
    print(name + ' Successfully Trained')

In [None]:
# One row per model with its scores and timing.
# 'Recall_score' must come from the Recall list; it was filled with Precision, which
# made the two columns identical.
Dict = {'Name':Name,'Accuracy':Accuracy,'Precision_score':Precision,'Recall_score':Recall,
        'F1_score':F1_Score,'Time Taken':Time_Taken}
model_df = pd.DataFrame(Dict)
model_df

In [None]:
# Order the models from most to least accurate and plot accuracy per model.
model_df = model_df.sort_values(by='Accuracy', ascending=False)
fig = px.line(model_df, x="Name", y="Accuracy", title='Accuracy VS Model')
fig.show()

In [None]:
# Order the models from slowest to fastest and plot the fit + predict time per model.
model_df = model_df.sort_values(by='Time Taken', ascending=False)
fig = px.line(model_df, x="Name", y="Time Taken", title='Time Taken VS Model')
fig.show()