In [20]:
import re
import joblib
import string
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
df =pd.read_csv('multiclass-text-data.csv')

In [4]:
df.head()

Unnamed: 0,category,text
0,tech,New app ChatterAI trends among users
1,sports,Real Madrid winds the championship after budge...
2,politics,Leader David kim addresses nation on education...
3,entertainment,streaming platform releases The last Dance
4,entertainment,Director Samantha Lee plans sequel to Space Rush


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\s+',' ', text).strip()
    return text

In [9]:
df['cleaned_text'] = df['text'].apply(clean_text)

In [10]:
df

Unnamed: 0,category,text,cleaned_text
0,tech,New app ChatterAI trends among users,new app chatterai trends among users
1,sports,Real Madrid winds the championship after budge...,real madrid winds the championship after budge...
2,politics,Leader David kim addresses nation on education...,leader david kim addresses nation on education...
3,entertainment,streaming platform releases The last Dance,streaming platform releases the last dance
4,entertainment,Director Samantha Lee plans sequel to Space Rush,director samantha lee plans sequel to space rush
...,...,...,...
62,biology,The human body is made up of about 60% water.,the human body is made up of about water
63,history,The Industrial Revolution transformed manufact...,the industrial revolution transformed manufact...
64,art,Vincent van Gogh painted Starry Night.,vincent van gogh painted starry night
65,physics,Airplanes fly due to the principles of lift an...,airplanes fly due to the principles of lift an...


In [12]:
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['cleaned_text'])

In [13]:
x.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(df['cleaned_text'])

array([23, 30, 20, 35, 10, 36, 16, 13,  9,  6, 38, 24, 19, 25, 23, 37, 40,
       28,  3, 49, 33, 64, 42, 17, 54, 26, 45, 39,  5, 52, 21, 47, 53, 12,
       51,  7, 56,  8, 62, 50, 11, 32, 61,  4, 57,  0, 59, 14, 60, 43, 18,
       34, 41, 41, 58, 29, 22, 27, 55, 15, 48,  1, 44, 46, 63,  2, 31])

In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category_encoded'] = le.fit_transform(df['cleaned_text'])

In [41]:
df

Unnamed: 0,category,text,cleaned_text,category_encoded
0,tech,New app ChatterAI trends among users,new app chatterai trends among users,23
1,sports,Real Madrid winds the championship after budge...,real madrid winds the championship after budge...,30
2,politics,Leader David kim addresses nation on education...,leader david kim addresses nation on education...,20
3,entertainment,streaming platform releases The last Dance,streaming platform releases the last dance,35
4,entertainment,Director Samantha Lee plans sequel to Space Rush,director samantha lee plans sequel to space rush,10
...,...,...,...,...
62,biology,The human body is made up of about 60% water.,the human body is made up of about water,44
63,history,The Industrial Revolution transformed manufact...,the industrial revolution transformed manufact...,46
64,art,Vincent van Gogh painted Starry Night.,vincent van gogh painted starry night,63
65,physics,Airplanes fly due to the principles of lift an...,airplanes fly due to the principles of lift an...,2


In [45]:
df.tail(20)

Unnamed: 0,category,text,cleaned_text,category_encoded
47,biology,Elephants have excellent memory and strong soc...,elephants have excellent memory and strong soc...,14
48,music,The violin is a string instrument commonly use...,the violin is a string instrument commonly use...,60
49,biology,The heart pumps blood throughout the human body.,the heart pumps blood throughout the human body,43
50,food science,Ice cream melts when exposed to heat.,ice cream melts when exposed to heat,18
51,technology,Solar panels convert sunlight into electricity.,solar panels convert sunlight into electricity,34
52,history,The French Revolution began in 1789.,the french revolution began in,41
53,history,The French Revolution began in 1789.,the french revolution began in,41
54,history,The Taj Mahal is a mausoleum built by Emperor ...,the taj mahal is a mausoleum built by emperor ...,58
55,physics,Rainbows are caused by light refracting throug...,rainbows are caused by light refracting throug...,29
56,geography,Mount Everest is the tallest mountain in the w...,mount everest is the tallest mountain in the w...,22


In [17]:
y = df['category_encoded']

In [39]:
y

0     23
1     30
2     20
3     35
4     10
      ..
62    44
63    46
64    63
65     2
66    31
Name: category_encoded, Length: 67, dtype: int32

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [21]:
model = MultinomialNB()
model.fit(x_train,y_train)

  self.y_type_ = type_of_target(y, input_name="y")
  ys_types = set(type_of_target(x) for x in ys)
  y_is_multilabel = type_of_target(y).startswith("multilabel")
  y_type = type_of_target(y)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [22]:
y_pred = model.predict(x_test)

In [23]:
print(confusion_matrix(y_test,y_pred))

[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]]


In [27]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          23       1.00      1.00      1.00         1
          26       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         1
          41       0.00      0.00      0.00         0
          56       0.00      0.00      0.00         1
          63       0.00      0.00      0.00         1

    accuracy                           0.07        14
   macro avg       0.07      0.07      0.07        14
weighted avg       0.07   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [28]:
confusion_matrix(y_test,y_pred)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=int64)

In [29]:
joblib.dump(model,'MultinomialNB_spam.pkl')

['MultinomialNB_spam.pkl']

In [30]:
joblib.dump(vectorizer,'tfidf_MultinomialNB_spam.pkl')

['tfidf_MultinomialNB_spam.pkl']

In [55]:
lodel_model_classification = joblib.load('MultinomialNB_spam.pkl')

In [56]:
load_model_vec = joblib.load('tfidf_MultinomialNB_spam.pkl')

In [60]:
sample_text = ['coffee contains caffeine, a natural stimulant.', 'rainbows are caused by light refracting through water droplets.', 'the French Revolution began in 1789']

In [61]:
sample_cleaned = [clean_text(i) for i in sample_text]

In [62]:
sample_cleaned

['coffee contains caffeine a natural stimulant',
 'rainbows are caused by light refracting through water droplets',
 'the french revolution began in']

In [63]:
sample_cleaned = load_model_vec.transform(sample_cleaned)

In [64]:
sample_cleaned.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [65]:
lodel_model_classification.predict(sample_cleaned)

array([ 8, 29, 41])