In [1]:
import pandas as pd
import numpy as np
import nltk

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
df_start = pd.read_csv('preprocessed_df.csv').drop(columns=['Unnamed: 0'])
print(df_start.columns)

#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(df_start["Final_text"],df_start["Category"],test_size=0.2,shuffle=True)
print("X_train shape : {} \nX_test shape : {}".format(X_train.shape , X_test.shape))

Index(['Final_text', 'Category'], dtype='object')
X_train shape : (24006,) 
X_test shape : (6002,)


In [3]:
#Word2Vec
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]


In [4]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)


In [5]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10)
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)

#classification_report
print(classification_report(y_test , y_predict))
print('Confusion Matrix : \n' , confusion_matrix(y_test,y_predict))

              precision    recall  f1-score   support

         Art       0.99      0.99      0.99      2007
     Economy       0.99      0.99      0.99      1950
       Sport       0.99      1.00      0.99      2045

    accuracy                           0.99      6002
   macro avg       0.99      0.99      0.99      6002
weighted avg       0.99      0.99      0.99      6002

Confusion Matrix : 
 [[1983   16    8]
 [  10 1934    6]
 [   3    6 2036]]


In [6]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  

#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)

#classification_report
print(classification_report(y_test,y_predict))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

         Art       0.99      0.99      0.99      2007
     Economy       0.99      0.99      0.99      1950
       Sport       0.99      0.99      0.99      2045

    accuracy                           0.99      6002
   macro avg       0.99      0.99      0.99      6002
weighted avg       0.99      0.99      0.99      6002

Confusion Matrix: 
 [[1981   14   12]
 [  17 1927    6]
 [   5   14 2026]]


In [7]:
# prepare the input data 

from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer


stopwords = stopwords.words('english')
WordNetLemmatiz = WordNetLemmatizer()

def remove_punctuation(text):
    punc_free = "".join(i for i in text if i not in punctuation)
    return punc_free

def remove_stopwords(text):
    stop_free = [i for i in text if i not in stopwords]
    return stop_free

def lemmatizer(text):
    lemm_text = [WordNetLemmatiz.lemmatize(word) for word in text]
    return lemm_text


# preporcessing new text input
def all_processing(text):
    
    punc = remove_punctuation(text)
    low = punc.lower()
    words = word_tokenize(low)
    stop_free = " ".join(remove_stopwords(words))
    lemm_text = "".join(lemmatizer(stop_free))
    vectors = tfidf_vectorizer.transform([lemm_text])
    
    return vectors

test_input = all_processing("Love, Anger and Song: Remembering Youssef Chahine, Egypt's most eminent filmmaker")

input_predict = lr_tfidf.predict(test_input)
input_proba = lr_tfidf.predict_proba(test_input)

print("preddicted Category is :" , (input_predict[0]))
print("Category Labeles is    :      Art      Economy     Sports")
print("Category Probilties is :" , (input_proba))

labels = ["Art","Economy","Sports"]
labels[np.argmax(input_proba)]

preddicted Category is : Art
Category Labeles is    :      Art      Economy     Sports
Category Probilties is : [[0.98761557 0.00133588 0.01104856]]


'Art'