<a href="https://colab.research.google.com/github/Ishara20/Text_analysis_using_Python/blob/main/Text_Analysis_For_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install nltk



In [4]:
import nltk

# Tokenizer models for nltk.word_tokenize. Older NLTK releases load 'punkt';
# newer releases look up 'punkt_tab' instead. The install step is unpinned, so
# fetch both to keep a fresh-kernel run working on either.
# nltk.download reports a failure by returning False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
# Load the course descriptions: each line of the text file is one description
with open("Course-Descriptions.txt", 'r') as desc_file:
    descriptions = desc_file.read().splitlines()

# Peek at the first two entries
print("Sample course descriptions :", descriptions[:2])

# Fetch the NLTK corpora needed below: the stopword lists and WordNet
import nltk
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance, used by the custom tokenizer
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer

#Custom tokenizer that will perform tokenization, stopword removal
#and lemmatization
def customtokenize(str):
    """Tokenize a text, drop English stopwords, and lemmatize the rest.

    Passed as the ``tokenizer`` callable of ``TfidfVectorizer``.

    Args:
        str: The text to tokenize. The name shadows the ``str`` builtin; it
            is kept unchanged so existing callers are not affected.

    Returns:
        list: The lemmatized tokens, in their original order, with every
        token found in NLTK's English stopword list removed.
    """
    # Build the stopword set once per call. Previously
    # stopwords.words('english') was evaluated inside the filter lambda, i.e.
    # once for every token, and membership was a linear scan of that list.
    english_stopwords = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(str)
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]

# Fit the TF-IDF vectorizer on the descriptions using the custom tokenizer
vectorizer = TfidfVectorizer(tokenizer=customtokenize)
tfidf = vectorizer.fit_transform(descriptions)

# Show the first 25 vocabulary entries and the dimensions of the matrix
feature_names = vectorizer.get_feature_names_out()
print("\nSample feature names identified : ", feature_names[:25])
print("\nSize of TFIDF matrix : ", tfidf.shape)


Sample course descriptions : ['In this practical, hands-on course, learn how to do data preparation, data munging, data visualization, and predictive analytics. ', 'PHP is the most popular server-side language used to build dynamic websites, and though it is not especially difficult to use, nonprogrammers often find it intimidating. ']

Sample feature names identified :  ["'ll" "'re" "'s" '(' ')' ',' '.' '?' 'actively' 'adopting' 'amazon'
 'analysis' 'analytics' 'application' 'applied' 'architect' 'architecture'
 'around' 'aspect' 'associate' 'aws' 'basic' 'become' 'begin' 'big']

Size of TFIDF matrix :  (20, 238)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Building the model

In [8]:
# Load the pre-built classification labels used for training, one per line
with open("Course-Classification.txt", 'r') as label_file:
    classifications = label_file.read().splitlines()

#Create Labels and integer classes
from sklearn import preprocessing

# Fit a label encoder on the class names (fit returns the encoder itself)
le = preprocessing.LabelEncoder().fit(classifications)
print("Classes found : ", le.classes_)

# Map every class name to its integer code for use with the ML model
int_classes = le.transform(classifications)
print("\nClasses converted to integers :", int_classes)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the data for testing; the fixed random_state keeps the
# split repeatable between runs
xtrain, xtest, ytrain, ytest = train_test_split(
    tfidf, int_classes, random_state=0
)

# Train a multinomial Naive Bayes classifier on the training portion.
# fit returns the estimator, so the cell still ends in an assignment and
# displays no output.
classifier = MultinomialNB()
classifier = classifier.fit(xtrain, ytrain)


Classes found :  ['Cloud-Computing' 'Data-Science' 'Programming']

Classes converted to integers : [1 2 2 0 1 2 1 2 0 1 1 2 2 0 2 0 0 0 2 2]


# Running Predictions

In [9]:
from sklearn import metrics


def show_scores(actual, predicted):
    """Print the confusion matrix and the accuracy of ``predicted`` vs ``actual``."""
    print("Confusion Matrix : ")
    print(metrics.confusion_matrix(actual, predicted))
    print("\n Prediction Accuracy : ", metrics.accuracy_score(actual, predicted))


# Score the classifier on the held-out test split
print("Testing with Test Data :\n------------------------")
predictions = classifier.predict(xtest)
show_scores(ytest, predictions)

# Score the classifier on every document in the corpus
print("\nTesting with Full Corpus :\n--------------------------")
predictions = classifier.predict(tfidf)
show_scores(int_classes, predictions)


Testing with Test Data :
------------------------
Confusion Matrix : 
[[1 0 0]
 [0 0 1]
 [1 0 2]]

 Prediction Accuracy :  0.6

Testing with Full Corpus :
--------------------------
Confusion Matrix : 
[[6 0 0]
 [0 4 1]
 [1 0 8]]

 Prediction Accuracy :  0.9
