In [1]:
# importing libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
import re       #regular expression is used to search the words in a text
import spacy
import nltk 
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords  #stopwords means those words which doesn't add much value to text context like articles
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np

ModuleNotFoundError: No module named 'spacy'

In [2]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.4.2-cp39-cp39-win_amd64.whl (11.9 MB)
     ---------------------------------------- 11.9/11.9 MB 3.2 MB/s eta 0:00:00
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.5-cp39-cp39-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 3.4 MB/s eta 0:00:00
Collecting spacy-legacy<3.1.0,>=3.0.10
  Downloading spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Collecting jinja2
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
     -------------------------------------- 133.1/133.1 kB 4.0 MB/s eta 0:00:00
Collecting pathy>=0.3.5
  Downloading pathy-0.6.2-py3-none-any.whl (42 kB)
     ---------------------------------------- 42.8/42.8 kB 2.0 MB/s eta 0:00:00
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp39-cp39-win_amd64.whl (18 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.2-cp39-cp39-


[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


### Importing dataset

In [None]:
# pandas is used to build DataFrames and to hold the loaded data in them
COMPLAINTS_ZIP_URL = "https://github.com/srivatsan88/YouTubeLI/blob/master/dataset/consumer_compliants.zip?raw=true"

# Read the zipped CSV straight from the URL and preview the first two rows.
data = pd.read_csv(COMPLAINTS_ZIP_URL, compression='zip')
data.head(2)

### Exploratory data analysis

In [None]:
data['Product'].value_counts()

In [None]:
data['Company'].value_counts()

In [None]:
# Keep only the columns used in the rest of the notebook (Product, Company,
# Consumer complaint narrative) and rename the narrative column to 'Complaint'.
complaint_data = data[['Product', 'Company', 'Consumer complaint narrative']].rename(
    columns={'Consumer complaint narrative': 'Complaint'}
)

# None means "never truncate cell text". The legacy value -1 has been deprecated
# since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
complaint_data

In [None]:
# Number of missing values per column: isnull() flags the missing cells and
# sum() adds up the True flags (count() would just return the row count).
print(complaint_data.isnull().sum())

In [None]:
print(complaint_data.dtypes)

### **Data Preprocessing**

In [None]:
# 70/30 train/test split with a fixed seed for reproducibility.
# Each part is copied so that columns added to `train` later are written to an
# independent frame and pandas does not emit a SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(complaint_data, test_size=0.3, random_state=42)
)

In [None]:
train.shape

In [None]:

# Remove commas, periods, exclamation marks and question marks from every complaint.
# The pattern is a raw string so that `\.` reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
train['Complaint_processed'] = train['Complaint'].map(lambda text: re.sub(r'[,\.!?]', '', text))

In [None]:
# Lower-case every cleaned complaint, then preview the first rows.
train['Complaint_processed'] = train['Complaint_processed'].map(str.lower)
train['Complaint_processed'].head()

In [None]:
!pip install nltk   #natural language toolkit 

In [None]:
# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

# Snowball stemmer for English. The PorterStemmer instance that used to be
# created here was overwritten on the very next line before any use, so that
# dead assignment has been removed.
stemmer = nltk.stem.SnowballStemmer('english')

## Tokenisation

In [None]:

def tokenize(text):
    """Split ``text`` into word tokens, filter them, and return their stems.

    A token is kept when all of the following hold:

    * it is longer than 3 characters;
    * after stripping leading/trailing ``X``, ``x`` and ``/`` characters it is
      longer than 2 characters;
    * after additionally deleting every run of digits it is still longer than
      3 characters (this is the strictest of the three checks);
    * it is not in the module-level ``stop_words`` collection.

    NOTE(review): the ``Xx/`` stripping presumably targets redaction masks such
    as ``XXXX`` or ``XX/XX/XXXX`` in the complaint text; confirm against the data.

    Parameters
    ----------
    text : str
        The document to tokenise.

    Returns
    -------
    list of str
        Stems (from the module-level ``stemmer``) of the surviving tokens, in
        their original order.
    """
    stems = []
    for word in nltk.word_tokenize(text):
        # Strip once and reuse the result for both length checks.
        core = word.strip('Xx/')
        long_enough = (
            len(word) > 3
            and len(core) > 2
            # Raw string: `\d` must reach the regex engine unmodified.
            and len(re.sub(r'\d+', '', core)) > 3
        )
        if long_enough and word not in stop_words:
            stems.append(stemmer.stem(word))
    return stems


# Plain Python list of the cleaned training complaints ...
docs = train['Complaint_processed'].tolist()
# ... and the stemmed token list of each one, in the same order.
data_words = list(map(tokenize, docs))

In [None]:
print(len(data_words))

data_words[:2]

## Convert Text into Numerical Representation


In [None]:
# NLP offers many vectorizers that turn text tokens into numerical features.
# Two of them are tried here:
#   1 - TF-IDF vectorizer
#   2 - Count vectorizer
# More about them can be found in the Document file.

# Settings shared by both vectorizers: custom tokenizer, document-frequency
# bounds, vocabulary cap, and no lower-casing inside the vectorizer.
shared_vectorizer_args = dict(
    tokenizer=tokenize,
    max_df=0.75,
    min_df=50,
    max_features=10000,
    lowercase=False,
)

# use_idf=False switches the IDF re-weighting off for this vectorizer.
tf_ifd = TfidfVectorizer(use_idf=False, **shared_vectorizer_args)

# Plain term counts.
cv = CountVectorizer(**shared_vectorizer_args)

In [None]:
# Fit both vectorizers on the cleaned training complaints and keep the
# resulting document-term matrices.
processed_complaints = train['Complaint_processed']
tf_vector = tf_ifd.fit_transform(processed_complaints)
vc_vectore = cv.fit_transform(processed_complaints)

In [None]:
# .toarray() is the supported way to densify a SciPy sparse matrix; the `.A`
# shorthand is deprecated in recent SciPy releases.
print("tf_vector matrx", tf_vector.toarray())
tf_vector.shape

In [None]:
# .toarray() is the supported way to densify a SciPy sparse matrix; the `.A`
# shorthand is deprecated in recent SciPy releases.
print("count vectorizer matrix ", vc_vectore.toarray())
vc_vectore.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() keeps cv_voc a plain list, as before.
cv_voc = cv.get_feature_names_out().tolist()


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() keeps tf_voc a plain list, as before.
tf_voc = tf_ifd.get_feature_names_out().tolist()

In [None]:
# vocabulary_ maps every retained term to its column index, so its length is
# the vocabulary size. This avoids get_feature_names(), which was removed in
# scikit-learn 1.2.
print("length of tf_ifd vector vocabalury is :", len(tf_ifd.vocabulary_))
print("length of cv vector vocabalury is :", len(cv.vocabulary_))

## LDA Implementation Using scikit-learn

In [None]:
n_topics = 8

# LDA with n_topics components, trained with the online learning method and a
# fixed seed.
model_lda = LDA(
    n_components=n_topics,
    learning_method='online',
    max_iter=20,
    learning_offset=50,
    n_jobs=-1,
    random_state=42,
)

# Fit the model on tf_vector and get the per-document topic weights back.
# NOTE(review): tf_vector is the TfidfVectorizer output, not the count matrix
# (vc_vectore) that the original comment mentioned; confirm which was intended.
X_topics = model_lda.fit_transform(tf_vector)


In [None]:

# .components_ is the fitted topic-word matrix: one row per topic, one column
# per vocabulary term.
topic_words = model_lda.components_

In [None]:
X_topics

In [None]:
topic_words

So, **what do X_topics and topic_words represent?**


*  **X_topics**: the probability of each topic for each document, so its shape is (no. of docs, no. of topics). It is known as the document-topic matrix.
*   **topic_words**: the weight of each word in each topic (scikit-learn stores unnormalised pseudo-counts in `components_`; normalising each row turns them into probabilities), so its shape is (no. of topics, no. of words). It is known as the topic-word matrix.



In [None]:
print(topic_words.shape)
print(X_topics.shape)

In [None]:
n_top_words = 15
topics = []

# Vocabulary as an array so it can be indexed with an array of positions.
vocab = np.array(tf_voc)

# The loop uses its own local names on purpose: reusing `topic_words` inside
# the loop would overwrite the topic-word matrix being iterated over and make
# this cell fail when it is run a second time.
for i, topic_dist in enumerate(topic_words):

    # np.argsort orders the term indices by ascending weight; reverse that order
    # and keep exactly the n_top_words heaviest terms.
    top_indices = np.argsort(topic_dist)[::-1][:n_top_words]

    # Translate the indices into the actual words via the vocabulary.
    top_terms = vocab[top_indices]

    topics.append(top_terms)
    print ("Topic", str(i), top_terms)

# Row k holds the top words of topic k.
topics=np.array(topics)

In [None]:
doc_topic = model_lda.transform(tf_vector)

# Index of the highest-weighted topic for every document (row-wise argmax),
# kept as a plain list.
topic_to_doc = doc_topic.argmax(axis=1).tolist()

def doc_to_topic(doc_no,topic_to_doc=topic_to_doc):
    """Print the dominant topic of document ``doc_no`` and that topic's top words.

    Parameters
    ----------
    doc_no : int
        Zero-based row position of the document in the matrix that was passed
        to ``model_lda.transform``.
    topic_to_doc : sequence of int, optional
        Dominant topic index per document; defaults to the list built above.
    """
    # Index with doc_no itself: the previous `doc_no+1` reported the topic of
    # the *next* document and raised IndexError for the last one.
    assigned = topic_to_doc[doc_no]
    print("topic assigned to document ",doc_no," is ",assigned," that is ",topics[assigned])

doc_to_topic(2)

In [None]:
# One column per topic, named topic0, topic1, ... The names are derived from the
# matrix width so the cell keeps working if the number of topics changes.
topic_columns = [f'topic{k}' for k in range(X_topics.shape[1])]
topic_assigned_to_doc = pd.DataFrame(X_topics, columns=topic_columns)
topic_assigned_to_doc['topic_assigned'] = topic_to_doc

In [None]:
topic_assigned_to_doc

In [None]:
from sklearn.cluster import KMeans

# K-means with 8 clusters. random_state fixes the centroid initialisation so
# the clusters are reproducible from run to run.
km = KMeans(8, init='k-means++', max_iter=20, random_state=42)
km.fit(tf_vector)

In [None]:
# For every centroid, order the vocabulary indices from highest to lowest weight.
centroids = km.cluster_centers_.argsort()[:, ::-1]

# Print the 15 highest-weighted terms of each of the 8 centroids on one line.
for cluster_id in range(8):
    print("centroid ", cluster_id)
    top_terms = [tf_voc[term_idx] for term_idx in centroids[cluster_id, :15]]
    print(''.join(f'{term} ' for term in top_terms))

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier created with default settings (not fitted here).
# NOTE(review): MNB is not referenced anywhere in the visible part of this
# notebook; confirm it is still needed.
MNB = MultinomialNB()

Non-negative Matrix Factorization (NNMF)

In [None]:
from sklearn.decomposition import NMF


In [None]:
# NOTE(review): this re-binds `tf_ifd` and fits it on the raw `Complaint` column
# of the full dataset (no punctuation removal or lower-casing, train and test
# rows together), unlike the LDA section, which used train.Complaint_processed.
# Confirm that this is intended.
tf_ifd = TfidfVectorizer(tokenizer=tokenize, max_df=0.75, min_df=50, max_features=10000, use_idf=False, lowercase=False)
X = tf_ifd.fit_transform(complaint_data.Complaint)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it.
words = np.array(tf_ifd.get_feature_names_out())

print(X)
# The label now names the variable that is actually printed.
print("words = ", words)

In [None]:
# Applying Non-Negative Matrix Factorization

# 8 components with the multiplicative-update solver. random_state fixes the
# initialisation so the factorisation is reproducible from run to run.
model_nmf = NMF(n_components=8, solver="mu", random_state=42)
W = model_nmf.fit_transform(X)   # document-topic weights
H = model_nmf.components_        # topic-term weights

# For each topic print its 10 highest-weighted words (ascending by weight).
# Topics are numbered from 1 in this listing.
for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ",".join([str(x) for x in words[topic.argsort()[-10:]]])))

In [None]:
doc_topic_byNMF = model_nmf.transform(X)

# Index of the highest-weighted NMF topic for every document (row-wise argmax),
# kept as a plain list.
topic_to_doc_byNMF = doc_topic_byNMF.argmax(axis=1).tolist()

def doc_to_topic_byNMF(doc_no,topic_to_doc_byNMF=topic_to_doc_byNMF):
    """Print the dominant NMF topic of document ``doc_no`` and that topic's top words.

    Parameters
    ----------
    doc_no : int
        Zero-based row position of the document in ``X``.
    topic_to_doc_byNMF : sequence of int, optional
        Dominant NMF topic index (zero-based) per document; defaults to the
        list built above.
    """
    # Index with doc_no itself: the previous `doc_no+1` reported the topic of
    # the *next* document and raised IndexError for the last one.
    assigned = topic_to_doc_byNMF[doc_no]
    # Take the words from the NMF topic-term weights. The `topics` array used
    # before holds the LDA top words, which are unrelated to NMF topic ids.
    nmf_top_words = words[model_nmf.components_[assigned].argsort()[-10:]]
    print("topic assigned to document ",doc_no," is ",assigned," that is ",nmf_top_words)

doc_to_topic_byNMF(2)

In [None]:
# One column per NMF topic, named topic0, topic1, ... The names are derived from
# the width of W so the cell keeps working if n_components changes.
nmf_topic_columns = [f'topic{k}' for k in range(W.shape[1])]
topic_assigned_to_doc_byNMF = pd.DataFrame(W, columns=nmf_topic_columns)
topic_assigned_to_doc_byNMF['topic_assigned'] = topic_to_doc_byNMF

In [None]:
topic_assigned_to_doc_byNMF

Latent Semantic Analysis (LSA)

In [None]:
from scipy import linalg, spatial
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.feature_extraction.text import (CountVectorizer, TfidfTransformer, TfidfVectorizer)

from sklearn.utils.extmath import randomized_svd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords