In [48]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, SimpleRNN
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [27]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\einst\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\einst\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
#Load Data

raw_numpy = pd.read_csv("numpy.csv")
raw_pandas = pd.read_csv("pandas.csv")

In [29]:
raw_numpy["class"]="numpy"
raw_pandas["class"]="pandas"

In [30]:
raw_numpy.head()

Unnamed: 0,Questions,class
0,numpy savez() of a list of 3D arrays with diff...,numpy
1,'DataFrame' object has no attribute 'Close' in...,numpy
2,How to Look up multiple values for one column ...,numpy
3,What are fastest data types to use as an alter...,numpy
4,rgb2hed not giving the same answer when applie...,numpy


In [31]:
numpy_data = raw_numpy
numpy_data["Questions"] = [q.lower() for q in  raw_numpy["Questions"]]
numpy_data.head()

Unnamed: 0,Questions,class
0,numpy savez() of a list of 3d arrays with diff...,numpy
1,'dataframe' object has no attribute 'close' in...,numpy
2,how to look up multiple values for one column ...,numpy
3,what are fastest data types to use as an alter...,numpy
4,rgb2hed not giving the same answer when applie...,numpy


In [32]:
pandas_data = raw_pandas
pandas_data["Questions"] = [q.lower() for q in  raw_pandas["Questions"]]
pandas_data.head()

Unnamed: 0,Questions,class
0,different results from any(df.isnull()) and pd...,pandas
1,pandas ewm correlation - not rolling,pandas
2,bar chart in python jupyter lab not plotting i...,pandas
3,python pandas groupby: how to use variables in...,pandas
4,another way to iterate to get multiple values ...,pandas


In [33]:
raw_x = pd.concat([pandas_data, numpy_data])

In [34]:
print(pandas_data.shape)
print(numpy_data.shape)
print(raw_x.shape)

(47904, 2)
(45005, 2)
(92909, 2)


In [35]:
raw_x["class"] = raw_x["class"].apply(lambda x: 0 if x == 'pandas' else 1)
raw_x.head()

Unnamed: 0,Questions,class
0,different results from any(df.isnull()) and pd...,0
1,pandas ewm correlation - not rolling,0
2,bar chart in python jupyter lab not plotting i...,0
3,python pandas groupby: how to use variables in...,0
4,another way to iterate to get multiple values ...,0


In [36]:
tokens = raw_x["Questions"].apply(lambda x: word_tokenize(x))
tokens

0        [different, results, from, any, (, df.isnull, ...
1              [pandas, ewm, correlation, -, not, rolling]
2        [bar, chart, in, python, jupyter, lab, not, pl...
3        [python, pandas, groupby, :, how, to, use, var...
4        [another, way, to, iterate, to, get, multiple,...
                               ...                        
45000    [how, to, calculate, the, mean, and, standard,...
45001    [how, can, i, apply, a, function, to, a, scrol...
45002    [populate, numpy, array, according, to, predef...
45003    [why, am, i, getting, so, many, indices, when,...
45004        [how, to, change, np.nan, type, nat, to, nan]
Name: Questions, Length: 92909, dtype: object

In [37]:
words = tokens.apply(lambda x: [w for w in x if w.isalpha()])
words

0               [different, results, from, any, and, data]
1                 [pandas, ewm, correlation, not, rolling]
2        [bar, chart, in, python, jupyter, lab, not, pl...
3        [python, pandas, groupby, how, to, use, variab...
4        [another, way, to, iterate, to, get, multiple,...
                               ...                        
45000    [how, to, calculate, the, mean, and, standard,...
45001    [how, can, i, apply, a, function, to, a, scrol...
45002    [populate, numpy, array, according, to, predef...
45003    [why, am, i, getting, so, many, indices, when,...
45004                [how, to, change, type, nat, to, nan]
Name: Questions, Length: 92909, dtype: object

In [38]:
stop_words = set(stopwords.words('english'))
filtered_words = words.apply(lambda x: [word for word in x if word not in stop_words])
filtered_words

0                               [different, results, data]
1                      [pandas, ewm, correlation, rolling]
2        [bar, chart, python, jupyter, lab, plotting, i...
3        [python, pandas, groupby, use, variables, diff...
4        [another, way, iterate, get, multiple, values,...
                               ...                        
45000    [calculate, mean, standard, deviation, similar...
45001          [apply, function, scrolling, window, numpy]
45002    [populate, numpy, array, according, predefined...
45003    [getting, many, indices, ndarray, passed, nonz...
45004                             [change, type, nat, nan]
Name: Questions, Length: 92909, dtype: object

In [39]:
# Stem the words
stemmer = PorterStemmer()
stemmed_words = filtered_words.apply(lambda x: [stemmer.stem(word) for word in x])
stemmed_words

0                                   [differ, result, data]
1                               [panda, ewm, correl, roll]
2        [bar, chart, python, jupyt, lab, plot, instead...
3        [python, panda, groupbi, use, variabl, differ,...
4        [anoth, way, iter, get, multipl, valu, singl, ...
                               ...                        
45000    [calcul, mean, standard, deviat, similar, matrix]
45001             [appli, function, scroll, window, numpi]
45002    [popul, numpi, array, accord, predefin, array,...
45003    [get, mani, indic, ndarray, pass, nonzero, fun...
45004                              [chang, type, nat, nan]
Name: Questions, Length: 92909, dtype: object

In [40]:
clean_sentence = stemmed_words.apply(lambda x: [" ".join(x)])
clean_sentence

0                                     [differ result data]
1                                  [panda ewm correl roll]
2        [bar chart python jupyt lab plot instead throw...
3        [python panda groupbi use variabl differ colum...
4        [anoth way iter get multipl valu singl date co...
                               ...                        
45000         [calcul mean standard deviat similar matrix]
45001                 [appli function scroll window numpi]
45002       [popul numpi array accord predefin array valu]
45003       [get mani indic ndarray pass nonzero function]
45004                                 [chang type nat nan]
Name: Questions, Length: 92909, dtype: object

In [41]:
clean_x = raw_x
clean_x["Questions"] = clean_sentence

In [42]:
clean_x.isnull().sum()

Questions    0
class        0
dtype: int64

In [43]:
lb = LabelEncoder()
clean_x["class"] = lb.fit_transform(clean_x["class"])
clean_x.dtypes

Questions    object
class         int64
dtype: object

In [44]:
clean_x_10 = clean_x.sample(frac=0.1)

In [51]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(clean_x["Questions"])
X = tokenizer.texts_to_sequences(clean_x["Questions"])
X = pad_sequences(X, maxlen=500)
one_hot_results=tokenizer.texts_to_matrix(clean_x, mode="binary")
word_index=tokenizer.word_index
X.shape

(92909, 500)

In [59]:
model=Sequential()
model.add(Embedding(10000,32))
model.add(SimpleRNN(32))
model.add(Dense(2,activation="sigmoid"))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 32)          320000    
                                                                 
 simple_rnn_6 (SimpleRNN)    (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 322,146
Trainable params: 322,146
Non-trainable params: 0
_________________________________________________________________


In [60]:
y = pd.get_dummies(clean_x["class"]).values

In [67]:
model.compile(optimizer="adam",
              loss="categorical_crossentropy",
              metrics=["accuracy"])
history=model.fit(X, y, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
 42/581 [=>............................] - ETA: 2:53 - loss: 0.6446 - accuracy: 0.6363

KeyboardInterrupt: 

In [62]:
X_unseen, X_test, y_unseen, y_test = train_test_split(X, y, test_size=0.15, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, shuffle=True)

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.fit_transform(X_test)

In [71]:
# Define the neural network model
model2 = Sequential()
# model.add(Embedding(input_dim=10000, output_dim=128, input_length=100))
# model.add(LSTM(units=32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
# model.add(LSTM(units=16, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
# model.add(Dense(units=2, activation='sigmoid'))

# Input - Layer
model2.add(Embedding(input_dim=10000, output_dim=128, input_length=500))
# Hidden - Layers
model2.add(Dropout(0.4, noise_shape=None, seed=None))
model2.add(Dense(50, activation = "relu"))
model2.add(Dropout(0.3, noise_shape=None, seed=None))
model2.add(Dense(50, activation = "relu"))
# Output- Layer
model2.add(Dense(2, activation = "sigmoid"))
model2.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 dropout_6 (Dropout)         (None, 500, 128)          0         
                                                                 
 dense_12 (Dense)            (None, 500, 50)           6450      
                                                                 
 dropout_7 (Dropout)         (None, 500, 50)           0         
                                                                 
 dense_13 (Dense)            (None, 500, 50)           2550      
                                                                 
 dense_14 (Dense)            (None, 500, 2)            102       
                                                                 
Total params: 1,289,102
Trainable params: 1,289,102
No

In [72]:
# Compile the model
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [73]:
model2.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 dropout_6 (Dropout)         (None, 500, 128)          0         
                                                                 
 dense_12 (Dense)            (None, 500, 50)           6450      
                                                                 
 dropout_7 (Dropout)         (None, 500, 50)           0         
                                                                 
 dense_13 (Dense)            (None, 500, 50)           2550      
                                                                 
 dense_14 (Dense)            (None, 500, 2)            102       
                                                                 
Total params: 1,289,102
Trainable params: 1,289,102
No

In [74]:
# Train the model
model2.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5


ValueError: in user code:

    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\einst\.conda\envs\tf\lib\site-packages\keras\backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 2) and (None, 500, 2) are incompatible


In [54]:
# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'loss: {loss:.2f}')
print(f'Accuracy: {accuracy:.2f}')

loss: 0.64
Accuracy: 0.56


In [None]:
word2vec
phrase2vec
BAgof words
PCA

decribing scraper along woth feature descriptions
