In [None]:
import pandas as pd
import numpy as np

import os 

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, classification_report
from sklearn import datasets

In [None]:
df = pd.read_csv('amazon_au_marketing.tsv', sep='\t')

In [None]:
to_drop = [
    'Dataset Origin', 'Match Rank', 'Match Score', 
    'Match Type', 'Upc', 'Product Available Inventory', 
    'Product Url', 'Retailer', 'Product Barcode', 
    'Product Model Number', 'Product Reviews Count', 
    'Bsr', 'Joining Key', 'Market', 'Product Currency',
    'Sku', 'Uniq Id', 'Crawl Timestamp', 'Product Id',
    'Product Company Type Source', 'Product Brand Normalised Source',
    'Product Rating', 'Product Price'
]

df_cleaned = df.drop(columns=to_drop)

In [None]:
# Fraction of the raw frame's columns that remain after the drop.
len(df_cleaned.columns)/len(df.columns)

In [None]:
# Preview the first rows of the reduced frame.
df_cleaned.head()

In [None]:
# Summary statistics of the reduced frame.
df_cleaned.describe()

In [None]:
# Distinct values of the 'Product Category' column, read from the raw frame.
df['Product Category'].unique()

In [None]:
# Count the distinct 'Product Category' values (computed via .unique(), so a
# missing value counts as one entry) while the raw frame still exists.
n_categories = len(df['Product Category'].unique())

# Release the raw frame; later cells only use df_cleaned.
del df

# Keep the count as the cell's last expression so the notebook displays it
# (a trailing `del` statement would otherwise suppress the output).
n_categories

In [None]:
# Split the reduced frame into train and test parts with a fixed seed
# (no test_size is passed, so train_test_split's default proportion applies).
df_train, df_test = train_test_split(df_cleaned, random_state=123)
# Train-to-test row ratio, shown as the cell output.
len(df_train)/len(df_test)

In [None]:
# Category string compared against 'Product Category' to build the boolean target.
# NOTE(review): 'DÃ©cor' looks like a mis-decoded 'Décor'; presumably it matches
# the column values exactly as loaded — confirm before changing the literal.
word = 'Home > Home DÃ©cor > Photo Frames'
df_train.columns

In [None]:
def _corpus_and_target(frame, category):
    """Build the text corpus and boolean target for one data split.

    Parameters
    ----------
    frame : pandas.DataFrame
        Split containing at least the 'Product Category' and
        'Product Image Url' columns.
    category : str
        Category value treated as the positive class.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``corpus``: one string per row, made by joining the row's non-null
        feature values (cast to str) with commas; the Series is named 'corpus'.
        ``target``: True where 'Product Category' equals ``category``.
    """
    target = frame['Product Category'] == category
    # drop() returns a fresh frame, so adding the helper column below does not
    # touch the caller's data. The image URL is not used as a text feature.
    features = frame.drop(columns=['Product Category', 'Product Image Url'])
    features['corpus'] = features.apply(
        lambda row: ','.join(row.dropna().astype(str)),
        axis=1
    )
    return features['corpus'], target


# Discard rows without an image URL from both splits. Reassigning instead of
# using dropna(inplace=True) avoids mutating the frames handed back by
# train_test_split in place, which can trigger pandas' SettingWithCopyWarning.
df_train = df_train.dropna(axis=0, subset=['Product Image Url'])
df_test = df_test.dropna(axis=0, subset=['Product Image Url'])

# Train-to-test row ratio after the rows were removed.
print(len(df_train)/len(df_test))

# The same preprocessing is applied to both splits through one helper.
x_train, y_train = _corpus_and_target(df_train, word)
x_test, y_test = _corpus_and_target(df_test, word)

# The split frames are no longer needed once corpus and target exist.
del df_train
del df_test

In [None]:
# Show the first training entry to sanity-check the joined text.
x_train.iloc[0]

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score

In [None]:
# Baseline: a DummyClassifier with strategy='prior', fitted on the training
# corpus and scored on that same training data.
dc = DummyClassifier(strategy='prior')
dc.fit(x_train, y_train)
dc.score(x_train, y_train)

In [None]:
# Baseline score on the held-out test split.
dc.score(x_test, y_test)

In [None]:
# Confusion matrix of the baseline on the training split.
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is its replacement) —
# confirm the scikit-learn version this notebook is pinned to.
plot_confusion_matrix(dc, x_train, y_train)

In [None]:
# Bag-of-words vectorizer using the built-in English stop-word list.
countvec = CountVectorizer(stop_words='english')

In [None]:
def cross_validate_std(*args, **kwargs):
    """Like cross_validate, except also gives the standard deviation of the score.

    All positional and keyword arguments are forwarded unchanged to
    ``cross_validate``.

    Returns
    -------
    pandas.Series
        The per-column mean of the numeric ``cross_validate`` results, followed
        by one ``std_<column>`` entry for every ``test_*`` and ``train_*``
        column. With a single metric this yields ``std_test_score`` (and
        ``std_train_score`` when train scores were requested); with several
        metrics it yields e.g. ``std_test_precision``.
    """
    res = pd.DataFrame(cross_validate(*args, **kwargs))
    # numeric_only keeps the mean well-defined if non-numeric columns are
    # present in the result (e.g. fitted estimators).
    res_mean = res.mean(numeric_only=True)

    # Match score columns by prefix instead of hard-coding "test_score" /
    # "train_score", so multi-metric results no longer raise KeyError.
    for column in res.columns:
        if column.startswith(("test_", "train_")):
            res_mean["std_" + column] = res[column].std()
    return res_mean

In [None]:
# Text pipeline: bag-of-words counts fed into a logistic regression.
pipe_lr = make_pipeline(countvec, LogisticRegression())

# 5-fold cross-validation on the training split, scored by precision;
# mean and standard deviation are reported for train and test folds.
cross_validate_std(pipe_lr, x_train, y_train, return_train_score=True, cv=5, scoring='precision')

In [None]:
# Fit on the whole training split, then plot the confusion matrix on the test split.
pipe_lr.fit(x_train, y_train)
plot_confusion_matrix(pipe_lr, x_test, y_test)

In [None]:
# NOTE(review): hard-coded counts, presumably read off the confusion matrix
# plotted for pipe_lr (171 looks like the true positives, 7 and 16 the two
# kinds of error) — confirm, and consider deriving them with confusion_matrix()
# so the value stays correct when the data or model changes.
171/(171+7+16)

In [None]:
import spacy
# Load the spaCy pipeline named "en_core_web_md"; its document vectors are used
# as features below. Presumably the model package must already be installed.
nlp = spacy.load("en_core_web_md")

In [None]:
x_train = pd.DataFrame([sms.vector for sms in nlp.pipe(x_train)])
x_test  = pd.DataFrame([sms.vector for sms in nlp.pipe(x_test)])

In [None]:
x_train_embeddings.shape

In [None]:
lr = LogisticRegression()

In [None]:
cross_validate_std(lr, X_train_embeddings, y_train, return_train_score=True)

In [None]:
plot_confusion_matrix(lr, x_test, y_test)

In [None]:
# from tensorflow.keras.datasets import mnist
# from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, Conv2D, MaxPooling2D, GlobalAveragePooling2D
# from tensorflow.keras.models import Sequential, Model, load_model
# from tensorflow.keras.preprocessing.image import img_to_array, load_img
# from tensorflow.keras import utils
# from tensorflow.keras.applications.inception_v3 import InceptionV3
# from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions