In [29]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [30]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\einst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\einst\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
#Load Data

raw_numpy = pd.read_csv("numpy.csv")
raw_pandas = pd.read_csv("pandas.csv")

In [32]:
raw_numpy["class"]="numpy"
raw_pandas["class"]="pandas"

In [33]:
raw_numpy.head()

Unnamed: 0,Questions,class
0,numpy savez() of a list of 3D arrays with diff...,numpy
1,'DataFrame' object has no attribute 'Close' in...,numpy
2,How to Look up multiple values for one column ...,numpy
3,What are fastest data types to use as an alter...,numpy
4,rgb2hed not giving the same answer when applie...,numpy


In [34]:
numpy_data = raw_numpy
numpy_data["Questions"] = [q.lower() for q in  raw_numpy["Questions"]]
numpy_data.head()

Unnamed: 0,Questions,class
0,numpy savez() of a list of 3d arrays with diff...,numpy
1,'dataframe' object has no attribute 'close' in...,numpy
2,how to look up multiple values for one column ...,numpy
3,what are fastest data types to use as an alter...,numpy
4,rgb2hed not giving the same answer when applie...,numpy


In [35]:
pandas_data = raw_pandas
pandas_data["Questions"] = [q.lower() for q in  raw_pandas["Questions"]]
pandas_data.head()

Unnamed: 0,Questions,class
0,different results from any(df.isnull()) and pd...,pandas
1,pandas ewm correlation - not rolling,pandas
2,bar chart in python jupyter lab not plotting i...,pandas
3,python pandas groupby: how to use variables in...,pandas
4,another way to iterate to get multiple values ...,pandas


In [36]:
raw_x = pd.concat([pandas_data, numpy_data])

In [37]:
print(pandas_data.shape)
print(numpy_data.shape)
print(raw_x.shape)

(47904, 2)
(45005, 2)
(92909, 2)


In [38]:
raw_x["class"] = raw_x["class"].apply(lambda x: 0 if x == 'pandas' else 1)
raw_x.head()

Unnamed: 0,Questions,class
0,different results from any(df.isnull()) and pd...,0
1,pandas ewm correlation - not rolling,0
2,bar chart in python jupyter lab not plotting i...,0
3,python pandas groupby: how to use variables in...,0
4,another way to iterate to get multiple values ...,0


In [39]:
tokens = raw_x["Questions"].apply(lambda x: word_tokenize(x))
tokens

0        [different, results, from, any, (, df.isnull, ...
1              [pandas, ewm, correlation, -, not, rolling]
2        [bar, chart, in, python, jupyter, lab, not, pl...
3        [python, pandas, groupby, :, how, to, use, var...
4        [another, way, to, iterate, to, get, multiple,...
                               ...                        
45000    [how, to, calculate, the, mean, and, standard,...
45001    [how, can, i, apply, a, function, to, a, scrol...
45002    [populate, numpy, array, according, to, predef...
45003    [why, am, i, getting, so, many, indices, when,...
45004        [how, to, change, np.nan, type, nat, to, nan]
Name: Questions, Length: 92909, dtype: object

In [40]:
words = tokens.apply(lambda x: [w for w in x if w.isalpha()])
words

0               [different, results, from, any, and, data]
1                 [pandas, ewm, correlation, not, rolling]
2        [bar, chart, in, python, jupyter, lab, not, pl...
3        [python, pandas, groupby, how, to, use, variab...
4        [another, way, to, iterate, to, get, multiple,...
                               ...                        
45000    [how, to, calculate, the, mean, and, standard,...
45001    [how, can, i, apply, a, function, to, a, scrol...
45002    [populate, numpy, array, according, to, predef...
45003    [why, am, i, getting, so, many, indices, when,...
45004                [how, to, change, type, nat, to, nan]
Name: Questions, Length: 92909, dtype: object

In [41]:
stop_words = set(stopwords.words('english'))
filtered_words = words.apply(lambda x: [word for word in x if word not in stop_words])
filtered_words

0                               [different, results, data]
1                      [pandas, ewm, correlation, rolling]
2        [bar, chart, python, jupyter, lab, plotting, i...
3        [python, pandas, groupby, use, variables, diff...
4        [another, way, iterate, get, multiple, values,...
                               ...                        
45000    [calculate, mean, standard, deviation, similar...
45001          [apply, function, scrolling, window, numpy]
45002    [populate, numpy, array, according, predefined...
45003    [getting, many, indices, ndarray, passed, nonz...
45004                             [change, type, nat, nan]
Name: Questions, Length: 92909, dtype: object

In [42]:
# Stem the words
stemmer = PorterStemmer()
stemmed_words = filtered_words.apply(lambda x: [stemmer.stem(word) for word in x])
stemmed_words

0                                   [differ, result, data]
1                               [panda, ewm, correl, roll]
2        [bar, chart, python, jupyt, lab, plot, instead...
3        [python, panda, groupbi, use, variabl, differ,...
4        [anoth, way, iter, get, multipl, valu, singl, ...
                               ...                        
45000    [calcul, mean, standard, deviat, similar, matrix]
45001             [appli, function, scroll, window, numpi]
45002    [popul, numpi, array, accord, predefin, array,...
45003    [get, mani, indic, ndarray, pass, nonzero, fun...
45004                              [chang, type, nat, nan]
Name: Questions, Length: 92909, dtype: object

In [43]:
clean_sentence = stemmed_words.apply(lambda x: [" ".join(x)])
clean_sentence

0                                     [differ result data]
1                                  [panda ewm correl roll]
2        [bar chart python jupyt lab plot instead throw...
3        [python panda groupbi use variabl differ colum...
4        [anoth way iter get multipl valu singl date co...
                               ...                        
45000         [calcul mean standard deviat similar matrix]
45001                 [appli function scroll window numpi]
45002       [popul numpi array accord predefin array valu]
45003       [get mani indic ndarray pass nonzero function]
45004                                 [chang type nat nan]
Name: Questions, Length: 92909, dtype: object

In [44]:
clean_x = raw_x
clean_x["Questions"] = clean_sentence

In [45]:
clean_x.isnull().sum()

Questions    0
class        0
dtype: int64

In [46]:
lb = LabelEncoder()
clean_x["class"] = lb.fit_transform(clean_x["class"])
clean_x.dtypes

Questions    object
class         int64
dtype: object

In [47]:
clean_x_10 = clean_x.sample(frac=0.1)

In [48]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(clean_x["Questions"])
X = tokenizer.texts_to_sequences(clean_x_10["Questions"])
X = pad_sequences(X, maxlen=100)

y = pd.get_dummies(clean_x_10["class"]).values

In [49]:
X_unseen, X_test, y_unseen, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, shuffle=True)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

In [50]:
# Define the neural network model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(units=16, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(Dense(units=2, activation='sigmoid'))



In [51]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 128)          1280000   
                                                                 
 lstm_4 (LSTM)               (None, 100, 32)           20608     
                                                                 
 lstm_5 (LSTM)               (None, 16)                3136      
                                                                 
 dense_2 (Dense)             (None, 2)                 34        
                                                                 
Total params: 1,303,778
Trainable params: 1,303,778
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b37953c5e0>

In [54]:
# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'loss: {loss:.2f}')
print(f'Accuracy: {accuracy:.2f}')

loss: 0.64
Accuracy: 0.56


In [None]:
word2vec
phrase2vec
BAgof words
PCA

decribing scraper along woth feature descriptions
