In [1]:
import numpy as np
import pandas as pd

In [2]:
data=r'quora_questions.csv'
df=pd.read_csv(data)

In [3]:
df

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
404284,How many keywords are there in the Racket prog...
404285,Do you believe there is life after death?
404286,What is one coin?
404287,What is the approx annual cost of living while...


## Preprocessing

In [11]:
import re

df['prep']=df['Question'].apply(lambda x:re.sub("[^A-Za-z]+",' ',x))

df['prep']=df['prep'].apply(lambda x:re.sub(" \w ",' ',x))

df['prep']=df['prep'].apply(lambda x:re.sub(" \w\w ",' ',x))

df['prep']=df['prep'].apply(lambda x:x.strip())
df['prep']=df['prep'].apply(lambda x:x.lower())

## Feature Extraction

In [17]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [23]:
' '.join(['a','b','c'])

'a b c'

In [30]:
def lemmatizer(text):
    x=[word.lemma_ for word in nlp(text)]
    return ' '.join(x)

In [40]:
df['Question'].iloc[0]
lemmatizer(df['Question'].iloc[0])

'what be the step by step guide to invest in share market in india ?'

In [None]:
df['Question']=df['Question'].apply(lemmatizer)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')

dtm=tfidf.fit_transform(df['prep'])

# pd.DataFrame(data=dtm.toarray(),columns=tfidf.get_feature_names_out())

In [149]:
dtm

<404289x36620 sparse matrix of type '<class 'numpy.float64'>'
	with 1937429 stored elements in Compressed Sparse Row format>

## NMF

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf=NMF(n_components=20,random_state=42)

In [152]:
nmf.fit(dtm)

In [154]:
nmf.components_.shape

(20, 36620)

In [170]:
for index,comp in enumerate(nmf.components_):
    print(f"Topic:{index}")
    top_words=comp.argsort()[-15:]
    print(tfidf.get_feature_names_out()[top_words])
    print('\n')

Topic:0
['read' 'place' 'smartphone' 'mobile' 'visit' 'places' 'ways' 'movie'
 'buy' 'phone' 'books' 'book' 'laptop' 'movies' 'best']


Topic:1
['grads' 'recruit' 'looking' 'differ' 'sex' 'use' 'exist' 'really'
 'compare' 'cost' 'long' 'feel' 'work' 'mean' 'does']


Topic:2
['investment' 'help' 'free' 'home' 'easy' 'internet' 'banning' 'rupee'
 'youtube' 'notes' 'black' 'ways' 'earn' 'online' 'money']


Topic:3
['add' 'answered' 'needing' 'post' 'easily' 'improvement' 'delete' 'asked'
 'google' 'answers' 'answer' 'ask' 'question' 'questions' 'quora']


Topic:4
['balance' 'earth' 'day' 'death' 'changed' 'want' 'live' 'change' 'moment'
 'real' 'important' 'thing' 'meaning' 'purpose' 'life']


Topic:5
['america' 'happen' 'presidency' 'think' 'presidential' 'vote' 'better'
 'election' 'did' 'win' 'hillary' 'president' 'clinton' 'donald' 'trump']


Topic:6
['english' 'beginner' 'beginners' 'computer' 'hacking' 'book' 'want'
 'python' 'languages' 'java' 'start' 'learning' 'language' 'program

In [163]:
topic_results=nmf.fit_transform(dtm)

In [165]:
topic_results.shape

(404289, 20)

In [166]:
df['topic']=topic_results.argmax(axis=1)

In [167]:
df

Unnamed: 0,Question,prep,topic
0,What is the step by step guide to invest in sh...,what the step step guide invest share market i...,8
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,what the story kohinoor koh noor diamond,15
2,How can I increase the speed of my internet co...,how can increase the speed my internet connect...,17
3,Why am I mentally very lonely? How can I solve...,why mentally very lonely how can solve,9
4,"Which one dissolve in water quikly sugar, salt...",which one dissolve water quikly sugar salt met...,1
...,...,...,...
404284,How many keywords are there in the Racket prog...,how many keywords are there the racket program...,6
404285,Do you believe there is life after death?,do you believe there life after death,4
404286,What is one coin?,what one coin,8
404287,What is the approx annual cost of living while...,what the approx annual cost living while study...,9
