<a href="https://colab.research.google.com/github/JaxonBradshaw/455Final/blob/main/FinalProject455.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install pyLDAvis
# !pip install pyLDAvis.gensim
# !pip install logging
# !pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm
# !pip install spacy
# !pip install gower

# Step 1: Data Collection

In [None]:
def collectData(url, numberOfRows, brand):
  """Download sneaker records from the API at `url` into a DataFrame.

  Pages of 100 shoes are requested until `numberOfRows` has been covered
  (partial pages are not requested: 1350 rows -> 13 pages). Only shoes that
  have an image, a story and a non-zero retail price are kept, so the result
  usually has fewer rows than `numberOfRows`.

  Args:
    url: API endpoint to query.
    numberOfRows: Desired number of raw rows; rounded down to a multiple of 100.
    brand: Value sent as the `brand` query filter.

  Returns:
    DataFrame indexed by shoe name with the columns gender, silhouette,
    releaseDate, retailPrice, estimatedMarketValue, story and image.

  Raises:
    KeyError: If a response decodes as JSON but has no 'results' entry.
  """
  import json
  import os

  import pandas as pd
  import requests

  columns = ['name', 'gender', 'silhouette', 'releaseDate', 'retailPrice', 'estimatedMarketValue', 'story', 'image']
  df = pd.DataFrame(columns=columns)
  numberOfPages = int(numberOfRows / 100)

  headers = {
      # This API only allows 200 calls a month (on the free version).
      # NOTE(security): a key is hard-coded as the fallback so existing runs keep
      # working; set the RAPIDAPI_KEY environment variable to use your own key
      # and rotate/remove the literal before sharing this notebook.
      'x-rapidapi-host': 'the-sneaker-database.p.rapidapi.com',
      'x-rapidapi-key': os.environ.get('RAPIDAPI_KEY', 'c14d7bed93msh87c86d4d1ac7f06p174d68jsnbd2166c5d18b')
  }

  # Walk the pages of the API, collecting 100 shoes at a time
  for i in range(numberOfPages):

    # Include a filter by brand (we only want Jordans)
    querystring = {"limit": "100", 'page': str(i + 1), 'brand': brand}

    # If the request or the JSON decoding fails, report the page number and skip
    # the page. Previously execution fell through and re-used the previous
    # page's data (or raised NameError on the first page).
    try:
      response = requests.request("GET", url, headers=headers, params=querystring)
      json_data = json.loads(response.text)
    except (requests.RequestException, ValueError):
      print(i + 1)
      continue

    # Only add shoes that have an image, a story and a retail price
    for shoe in json_data['results']:
      if shoe['image']['original'] != '' and shoe['story'] != '' and shoe['retailPrice'] != 0:
        # Keyed by name, so a shoe returned twice overwrites its earlier row
        df.loc[shoe['name']] = [shoe['name'], shoe['gender'], shoe['silhouette'], shoe['releaseDate'],
                                shoe['retailPrice'], shoe['estimatedMarketValue'], shoe['story'],
                                shoe['image']['original']]

  df.set_index('name', inplace=True)

  return df

In [None]:
def collectData2(path='/content/drive/MyDrive/Colab Notebooks/Data/finalProjectImageData.csv'):
  """Load previously collected shoe data from a CSV instead of calling the APIs.

  Args:
    path: Location of the CSV file. Defaults to the original Colab/Drive
      location; pass your own path if the file lives elsewhere.

  Returns:
    DataFrame read from `path`, indexed by its 'name' column.
  """
  import pandas as pd

  df = pd.read_csv(path)

  return df.set_index('name')

# Step 2: Binning Groups


In [None]:
def bin_gender(df, col, percent=0.05):
  """Normalise gender labels and fold rare groups into 'Other'.

  'Mens' is renamed to 'men' and 'infant' to 'toddler'; afterwards every group
  whose share of all rows is below `percent` is relabelled 'Other'. The frame
  is modified in place and also returned.

  Args:
    df: DataFrame containing the column to bin.
    col: Name of the gender column.
    percent: Minimum share of rows (0-1) a group needs to keep its own label.

  Returns:
    The same DataFrame with `col` updated.
  """
  df.loc[df[col] == 'Mens', col] = 'men'
  df.loc[df[col] == 'infant', col] = 'toddler'

  # Series.iteritems() no longer exists in pandas 2.x, so work on the counts directly.
  # Shares are taken over all rows (len(df)), including rows with a missing value.
  counts = df[col].value_counts()
  rare_groups = counts[counts / len(df) < percent].index
  df.loc[df[col].isin(rare_groups), col] = 'Other'

  return df

In [None]:
def bin_silhouette(df, col, percent=0.05):
  """Group silhouette names into model families and fold rare ones into 'Other'.

  Values containing 'Jordan 1 ' or 'Air Force 1' become 'Air Jordan 1'; values
  containing 'jordan 6' become 'Air Jordan 6' (this check wins when both
  match). Afterwards every group whose share of all rows is below `percent`
  is relabelled 'Other'. The frame is modified in place and also returned.

  Args:
    df: DataFrame containing the column to bin.
    col: Name of the silhouette column.
    percent: Minimum share of rows (0-1) a group needs to keep its own label.

  Returns:
    The same DataFrame with `col` updated.
  """
  df[col] = df[col].astype(str)

  # Match against the values of `col` itself (previously the second column was
  # read by position, which only worked for one particular column order).
  raw = df[col].copy()
  is_jordan_1 = raw.str.contains('Jordan 1 ', regex=False) | raw.str.contains('Air Force 1', regex=False)
  # NOTE(review): this pattern is lower-case while the one above is capitalised;
  # matching is case-sensitive, so confirm the data really contains 'jordan 6'.
  is_jordan_6 = raw.str.contains('jordan 6', regex=False)

  df.loc[is_jordan_1, col] = 'Air Jordan 1'
  df.loc[is_jordan_6, col] = 'Air Jordan 6'

  # Series.iteritems() no longer exists in pandas 2.x, so work on the counts directly
  counts = df[col].value_counts()
  rare_groups = counts[counts / len(df) < percent].index
  df.loc[df[col].isin(rare_groups), col] = 'Other'

  return df
  

# Step 3: Fix Dates

In [None]:
def fix_dates(df, col, reference_date='2022-01-01'):
  """Replace a date column with the number of days between each date and a reference date.

  Only the first ten characters of each value are parsed (YYYY-MM-DD), so any
  trailing time component is ignored. The frame is modified in place and also
  returned.

  Args:
    df: DataFrame containing the date column.
    col: Name of the column holding date strings.
    reference_date: Date (YYYY-MM-DD) the ages are measured against.

  Returns:
    The same DataFrame with `col` replaced by day counts. Missing or empty
    dates become NaN so they can be imputed later instead of raising.

  Raises:
    ValueError: If a non-empty value does not start with a YYYY-MM-DD date.
  """
  import datetime

  import pandas as pd

  reference = datetime.datetime.strptime(reference_date, "%Y-%m-%d")
  days_since_release = []

  for raw in df[col]:
    if pd.isna(raw) or str(raw).strip() == '':
      days_since_release.append(float('nan'))
      continue

    released = datetime.datetime.strptime(str(raw)[:10], "%Y-%m-%d")
    days_since_release.append((reference - released).days)

  # Replace the date strings with the differences in days
  df[col] = days_since_release

  return df


# Step 4: Adjust for Skewness

In [None]:
def fix_skewness(df):
  """Cast the two price columns to int and log-transform the market value.

  'estimatedMarketValue' is replaced by log(value + 1); 'retailPrice' is only
  cast to int. The frame is modified in place and also returned.
  """
  import numpy as np

  # Both price columns are forced to integers first (some raw values are odd)
  for price_col in ('estimatedMarketValue', 'retailPrice'):
    df[price_col] = df[price_col].astype(int)

  shifted_value = df['estimatedMarketValue'] + 1
  df['estimatedMarketValue'] = np.log(shifted_value)

  return df

# Step 5: Text Analytics


In [None]:
def sent_to_words(sentences):
  """Yield a list of tokens for every text in `sentences`.

  Each text has e-mail-like tokens removed, whitespace runs collapsed to a
  single space and single/double quotes stripped before it is tokenised with
  gensim's `simple_preprocess` (with `deacc=True`).

  Args:
    sentences: Iterable of strings.

  Yields:
    One token list per input text.
  """
  # Only the modules actually used are imported; the previous import list pulled
  # in `gensim.utils.lemmatize`, which does not exist in gensim 4+.
  import re

  import gensim

  for sent in sentences:
    sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
    sent = re.sub(r'\s+', ' ', sent)  # remove newline chars
    sent = re.sub("'", "", sent)  # remove single quotes
    sent = re.sub('"', "", sent)  # remove double quotes
    yield gensim.utils.simple_preprocess(str(sent), deacc=True)

In [None]:
def process_words(texts, stop_words, bigram_mod, trigram_mod, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
  """Remove stopwords, form bigrams/trigrams and lemmatize.

  Args:
    texts: Iterable of token lists (one per document).
    stop_words: Collection of words to discard, before and after lemmatization.
    bigram_mod: Phrase model applied as `bigram_mod[tokens]`.
    trigram_mod: Phrase model applied as `trigram_mod[bigram_mod[tokens]]`.
    allowed_postags: spaCy part-of-speech tags to keep.

  Returns:
    List of token lists containing the lemmas of the kept words.
  """
  # Only the modules actually used are imported; the previous import list pulled
  # in `gensim.utils.lemmatize`, which does not exist in gensim 4+.
  import spacy
  from gensim.utils import simple_preprocess

  texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
  texts = [bigram_mod[doc] for doc in texts]
  # The bigram model is applied a second time here, on its own output, before the trigram model
  texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

  # Load spacy, but we don't need the parser or NER (named entity extraction) modules
  nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

  texts_out = []
  for sent in texts:
    doc = nlp(" ".join(sent))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

  # Remove stopwords once more after lemmatization
  texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
  return texts_out

In [None]:
def text_analytics(df, num_topics=3):
  """Fit an LDA topic model on the shoe stories and add per-topic scores to `df`.

  The stories in `df.story` are tokenised, stripped of stopwords (NLTK's
  English list plus a project-specific list), phrase-merged and lemmatized.
  An LDA model is then fitted and each story's topic weights are written to
  the columns 'topic_1' ... 'topic_<n>'. Topics the model does not report for
  a story keep a score of 0.0. The frame is modified in place and also returned.

  Args:
    df: DataFrame with a 'story' column of strings.
    num_topics: Number of LDA topics to fit.

  Returns:
    The same DataFrame with the topic score columns added.
  """
  # Only the modules actually used are imported; the previous import list pulled
  # in `gensim.utils.lemmatize`, which does not exist in gensim 4+.
  import gensim
  import gensim.corpora as corpora
  import nltk
  import numpy as np
  from nltk.corpus import stopwords

  nltk.download('stopwords')
  stop_words = stopwords.words('english')

  # Domain words that appear in nearly every story and would dominate the topics
  stop_words.extend(['air', 'heel', 'upper', 'leather', 'black', 'tongue', 'white', 'feature', 'color', 'outsole', 'retro',
                     'midsole', 'sneaker', 'design', 'rubber', 'red', 'branding', 'contrast', 'overlay', 'make', 'top', 'collar',
                     'blue', 'shoe', 'colorway', 'release', 'finish', 'jumpman', 'signature', 'detail', 'logo', 'tag', 'swoosh',
                     'low', 'high', 'inspire', 'accent', 'nike', 'original', 'silhouette', 'classic', 'base','flap', 'foam', 'wing',
                     'lateral', 'mid', 'include', 'hue', 'green', 'suede', 'visible', 'panel', 'weave', 'unit', 'silver', 'embroider',
                     'deliver', 'take', 'mesh', 'scheme', 'combine', 'side', 'sole', 'metallic', 'gold', 'look', 'nubuck', 'element',
                     'hit', 'textile', 'yellow', 'construct', 'cushioning', 'woman', 'wear', 'iconic', 'ride', 'zoom', 'print',
                     'translucent', 'dark', 'grey', 'patent', 'purple', 'toe', 'forefoot', 'pop', 'underfoot', 'update', 'build',
                     'tumble', 'pattern', 'apply', 'give', 'offer', 'quarter', 'pink', 'complete', 'vibrant', 'royal', 'ankle',
                     'perforate', 'midfoot', 'lace', 'cupsole', 'synthetic', 'version', 'stitch', 'icon', 'appear', 'provide', 'first', 
                     'brand', 'university', 'lightweight', 'strap', 'box', 'style', 'celebrate', 'support', 'two_tone', 'edition',
                     'mudguard', 'traditional', 'game','model', 'reflective', 'kid', 'fit', 'premium', 'equip', 'material', 'encapsulate',
                     'cover', 'responsive', 'pair', 'match', 'court', 'present', 'highlight', 'additional', 'bright'])

  data = df.story.values.tolist()

  data_words = list(sent_to_words(data))

  bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
  trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
  bigram_mod = gensim.models.phrases.Phraser(bigram)
  trigram_mod = gensim.models.phrases.Phraser(trigram)

  data_ready = process_words(data_words, stop_words, bigram_mod, trigram_mod)

  id2word = corpora.Dictionary(data_ready)

  corpus = [id2word.doc2bow(text) for text in data_ready]

  # CODE TO FIND NUMBER OF TOPICS (kept for reference; needs `import pandas as pd`
  # and `from gensim.models import CoherenceModel`)

  # df_fit = pd.DataFrame(columns=['topics', 'perplexity', 'coherence'])

  # for n in range(3, 10):
  #   lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
  #                                               id2word=id2word,
  #                                               num_topics=n,
  #                                               random_state=12345,
  #                                               chunksize=20,
  #                                               passes=10,
  #                                               per_word_topics=True)

  #   # Generate fit metrics
  #   coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready, dictionary=id2word, coherence='c_v')
  #   # Add metrics to df_fit
  #   df_fit.loc[n - 3] = [n, round(lda_model.log_perplexity(corpus), 3), round(coherence_model_lda.get_coherence(), 3)]

  # print(df_fit)

  lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                              id2word=id2word,
                                              num_topics=num_topics,
                                              random_state=12345,
                                              chunksize=20,
                                              passes=10,
                                              per_word_topics=True)

  # Collect the scores in one (documents x topics) array, then create the columns
  # by name rather than computing column positions from the end of the frame.
  fitted_topics = len(lda_model.get_topics())
  topic_scores = np.zeros((len(data_ready), fitted_topics))

  for i, bow in enumerate(corpus):
    # With per_word_topics=True the model returns a tuple; element 0 holds the
    # (topic_id, score) pairs of the document.
    for topic_id, score in lda_model[bow][0]:
      topic_scores[i, topic_id] = score

  for k in range(fitted_topics):
    df[f'topic_{k + 1}'] = topic_scores[:, k]

  # Code to figure out stopwords that need to be removed! (kept for reference)

  # from collections import Counter
  # from matplotlib import pyplot as plt
  # from wordcloud import WordCloud, STOPWORDS
  # import matplotlib.colors as mcolors
  # topics = lda_model.show_topics(formatted=False)
  # data_flat = [w for w_list in data_ready for w in w_list]
  # counter = Counter(data_flat)

  # out = []
  # for i, topic in topics:
  #     for word, weight in topic:
  #         out.append([word, i , weight, counter[word]])

  # df_words = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

  # # Plot Word Count and Weights of Topic Keywords
  # fig, axes = plt.subplots(1, 3, figsize=(20,7), sharey=True, dpi=160)
  # cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
  # for i, ax in enumerate(axes.flatten()):
  #     ax.bar(x='word', height="word_count", data=df_words.loc[df_words.topic_id==i, :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
  #     ax_twin = ax.twinx()
  #     ax_twin.bar(x='word', height="importance", data=df_words.loc[df_words.topic_id==i, :], color=cols[i], width=0.2, label='Weights')
  #     ax.set_ylabel('Word Count', color=cols[i])
  #     # ax_twin.set_ylim(0, 0.030); ax.set_ylim(0, 3500)
  #     ax.set_title('Topic: ' + str(i + 1), color=cols[i], fontsize=16)
  #     ax.tick_params(axis='y', left=False)
  #     ax.set_xticklabels(df_words.loc[df_words.topic_id==i, 'word'], rotation=30, horizontalalignment= 'right')
  #     ax.legend(loc='upper center'); ax_twin.legend(loc='upper right')

  # fig.tight_layout(w_pad=2)
  # fig.suptitle('Word Count and Importance of Topic Keywords', fontsize=22, y=1.05)
  # plt.show()

  return df



In [None]:
def sentiment_calc(df):
  """Add VADER sentiment scores of each story to `df`.

  Adds the float columns 'sentiment_overall' (compound score),
  'sentiment_neg', 'sentiment_neu' and 'sentiment_pos'. The frame is modified
  in place and also returned.

  Args:
    df: DataFrame with a 'story' column of strings.

  Returns:
    The same DataFrame with the four sentiment columns added.
  """
  import nltk
  import pandas as pd
  from nltk.sentiment import SentimentIntensityAnalyzer

  # Word lists and lexicons in nltk: https://www.nltk.org/howto/corpus.html#word-lists-and-lexicons
  nltk.download('vader_lexicon')
  sia = SentimentIntensityAnalyzer()

  # Read the story by column name (it used to be read by tuple position, which
  # silently scored a different column whenever the column order changed).
  scores = pd.DataFrame([sia.polarity_scores(story) for story in df['story']],
                        columns=['compound', 'neg', 'neu', 'pos'], dtype=float)

  # Assign positionally (.values) so duplicate index labels cannot mix rows up
  df['sentiment_overall'] = scores['compound'].values
  df['sentiment_neg'] = scores['neg'].values
  df['sentiment_neu'] = scores['neu'].values
  df['sentiment_pos'] = scores['pos'].values

  return df

In [None]:
def word_count(df):
  """Add a 'story_length' column holding the length of each story.

  Despite the function name, the value is `len(story)`, i.e. the number of
  characters, not the number of words. The frame is modified in place and
  also returned.

  Args:
    df: DataFrame with a 'story' column of strings.

  Returns:
    The same DataFrame with the integer 'story_length' column added.
  """
  # Computed per row; the previous label-based lookup returned the number of
  # matching rows instead of the text length when index labels were duplicated.
  df['story_length'] = df['story'].map(len).astype(int)

  return df

In [None]:
def identify_entities(df):
  """Count named entities in each story and drop the raw text/image columns.

  Runs spaCy's 'en_core_web_sm' pipeline on every story and adds one integer
  column per tracked entity label (ORG, GPE, PRODUCT, LOC, DATE, ORDINAL,
  MONEY, PERSON), named '<label>_count_text' in lower case. Entities with any
  other label are ignored. Finally the 'story' and 'image' columns are
  dropped, so this must run after every step that still needs them. The frame
  is modified in place and also returned.

  Args:
    df: DataFrame with 'story' and 'image' columns.

  Returns:
    The same DataFrame with the count columns added and 'story'/'image' removed.
  """
  from collections import Counter

  import numpy as np
  import spacy

  label_to_column = {
      'ORG': 'org_count_text',
      'GPE': 'gpe_count_text',
      'PRODUCT': 'product_count_text',
      'LOC': 'loc_count_text',
      'DATE': 'date_count_text',
      'ORDINAL': 'ordinal_count_text',
      'MONEY': 'money_count_text',
      'PERSON': 'person_count_text',
  }

  # Load the model once; it used to be reloaded for every single row
  ner = spacy.load("en_core_web_sm")

  # One Counter of entity labels per story, read by column name rather than position
  label_counts = [Counter(ent.label_ for ent in ner(story).ents) for story in df['story']]

  for label, column in label_to_column.items():
    df[column] = np.array([counts[label] for counts in label_counts], dtype=int)

  # Drops story and images once the image processing and text analytics is complete
  df.drop(['story', 'image'], axis=1, inplace=True)

  return df

# Step 6: Image Analytics

In [None]:
def image_analytics(df):
  """Add colour features for each shoe image using the Imagga colors endpoint.

  For every row the image URL is sent to the API. The share of each foreground
  colour is stored in a 'color_<parent colour>' column (created on first use,
  0.0 for shoes without that colour) and the reported colour variance in
  'color_variance'. Rows whose image cannot be analysed (request failure,
  invalid JSON or a response missing the expected fields) are dropped. The
  frame is modified in place and also returned.

  Args:
    df: DataFrame with an 'image' column of image URLs.

  Returns:
    The same DataFrame with the colour columns added and failed rows removed.
  """
  import requests

  # NOTE(security): these API credentials are hard-coded in the notebook; move
  # them to environment variables / a secrets store and rotate them before sharing.
  # The three accounts are used in rotation; the list is indexed by i % 3.
  credentials = [
      ('acc_1d66baefb178be0', '92278e0f8f4ed443e42526fd40b67801'),
      ('acc_58e22a11bd859d6', '3f6951c2a8a851bf5e91199b141ca3d6'),
      ('acc_12406f00a60ea3d', '767a4f200acdfa7d96feaba046b72302'),
  ]

  # Float from the start: the columns receive percentages, and writing floats
  # into an integer column is deprecated in recent pandas.
  df['color_variance'] = 0.0

  failed_rows = []

  for i, (row_label, image_url) in enumerate(zip(df.index, df['image'])):
    try:
      # Let requests build the query string so the image URL is escaped properly
      response = requests.get(
          'https://api.imagga.com/v2/colors',
          params={'overall_count': 7, 'separated_count': 5, 'image_url': image_url},
          auth=credentials[i % 3])

      colors = response.json()['result']['colors']

      # Focus on colors of the image, not background colors.
      # Better to use the parent color instead of the actual color so that we
      # don't end up with as many columns.
      foreground = [(color['closest_palette_color_parent'], color['percent'])
                    for color in colors['foreground_colors']]
      color_variance = colors['color_variance']
    except (requests.RequestException, ValueError, KeyError, TypeError):
      failed_rows.append(row_label)
      continue

    # Create new columns / add percentages
    for parent_color, percentage in foreground:
      column = "color_" + parent_color
      if column not in df.columns:
        df[column] = 0.0
      df.loc[row_label, column] = percentage

    df.loc[row_label, 'color_variance'] = color_variance

  # Drop the rows that could not be analysed (labels de-duplicated first)
  df.drop(index=list(dict.fromkeys(failed_rows)), inplace=True)

  return df

# Step 7: Fill Missing Values

In [None]:
def impute_reg(df, label):
  """Dummy-code categorical columns and fill missing values by iterative imputation.

  Every non-numeric column is replaced by dummy columns (first level dropped).
  When `label` is 'gender' the 'gender' column is left untouched: it is set
  aside during imputation and re-attached afterwards as the last column.
  The caller's frame is not modified.

  Args:
    df: DataFrame to encode and impute.
    label: Name of the label column of the later model; only the value
      'gender' changes the behaviour.

  Returns:
    A new DataFrame with a fresh 0..n-1 index (the original index is not
    carried over) containing the imputed numeric features and, for
    label == 'gender', the original gender values.
  """
  from sklearn.experimental import enable_iterative_imputer  # required side-effect import; enables IterativeImputer
  from sklearn.impute import IterativeImputer
  import pandas as pd

  keep_gender = label == 'gender'

  # Dummy code first; categorical features are not allowed by the imputer.
  # This code is used for both regression and classification, so when 'gender'
  # is the label it must not be dummy coded.
  categorical = [col for col in df.columns
                 if not pd.api.types.is_numeric_dtype(df[col]) and not (keep_gender and col == 'gender')]
  if categorical:
    df = pd.get_dummies(df, columns=categorical, drop_first=True)

  # Scaling is unnecessary for regression-based imputation

  # Save the gender values. drop() returns a new frame here: the previous
  # in-place drop could remove 'gender' from the caller's DataFrame whenever
  # no column had been dummy coded.
  if keep_gender:
    gender_column = df['gender'].values
    df = df.drop(columns=['gender'])

  imp = IterativeImputer(max_iter=10, random_state=12345)
  df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

  # Replace the gender values after imputing
  if keep_gender:
    df['gender'] = gender_column

  return df

# Step 8: Feature Selection

In [None]:
def feature_selection_variance(df, label="", p=0.8):
  """Drop low-variance features using the threshold p * (1 - p).

  Args:
    df: DataFrame of numeric features, optionally including the label column.
    label: Name of the label column, which is excluded from the selection and
      kept as the last column of the result. Leave empty when `df` contains
      features only.
    p: Probability used to derive the variance threshold p * (1 - p).

  Returns:
    A new DataFrame with the features that passed the threshold, followed by
    the label column when `label` is given.
  """
  from sklearn.feature_selection import VarianceThreshold

  # With the default label="" all columns are features (previously this path
  # raised NameError because X was never assigned).
  has_label = label != ""
  X = df.drop(columns=[label]) if has_label else df

  sel = VarianceThreshold(threshold=(p * (1 - p)))
  sel.fit(X)

  selected = list(X.columns[sel.get_support()])

  # Add the label back in after removing poor features
  if has_label:
    selected.append(label)

  return df[selected]

# Step 9: Modeling

In [None]:
def fit_crossvalidate_reg(df, label, k=10, n=5, repeat=True):
  """Cross-validate a set of regressors and return the best one, refitted on all data.

  Each candidate is scored by its mean R-squared over the folds, the ranking
  is printed, and the top-ranked estimator is fitted on the full data set.

  Args:
    df: DataFrame holding the features and the label column.
    label: Name of the column to predict.
    k: Number of folds.
    n: Number of repetitions (only used when `repeat` is True).
    repeat: Use RepeatedKFold when True, a single shuffled KFold otherwise.

  Returns:
    The best-scoring estimator, fitted on all rows.
  """
  import sklearn.linear_model as lm, sklearn.ensemble as se
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import pandas as pd
  from numpy import mean
  from xgboost import XGBRegressor

  features = df.drop(columns=[label])
  target = df[label]

  cv = (RepeatedKFold(n_splits=k, n_repeats=n, random_state=12345) if repeat
        else KFold(n_splits=k, random_state=12345, shuffle=True))

  # Candidate estimators, keyed by the name shown in the printed ranking
  candidates = {
      'MLR': lm.LinearRegression(),
      'Ridge': lm.Ridge(),
      'Lasso': lm.Lasso(alpha=0.1),
      'AdaBoost': se.AdaBoostRegressor(random_state=12345, n_estimators=100),
      'XGBoost': XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8),
      'Bayesian': lm.BayesianRidge(),
      'Poisson': lm.TweedieRegressor(power=1, link="log"),  # power=1 is the Poisson distribution
      'Inverse': lm.TweedieRegressor(power=3),  # power=3 is the inverse Gaussian distribution
  }

  scores = {}
  for name, estimator in candidates.items():
    fold_scores = cross_val_score(estimator, features, target, scoring='r2', cv=cv, n_jobs=-1)
    scores[name] = mean(fold_scores)

  df_fit = pd.DataFrame({'R-squared': scores}).sort_values(by=['R-squared'], ascending=False)

  print('Regression Models')
  print(df_fit)

  winner = df_fit.index[0]
  return candidates[winner].fit(features, target)

In [None]:
def fit_crossvalidate_clf(df, label, k=10, n=5, repeat=True):
  """Cross-validate a set of classifiers and return the best one, refitted on all data.

  Each candidate is scored by its mean accuracy over the folds, the ranking
  is printed, and the top-ranked estimator is fitted on the full data set.

  Args:
    df: DataFrame holding the features and the label column.
    label: Name of the column to predict.
    k: Number of folds.
    n: Number of repetitions (only used when `repeat` is True).
    repeat: Use RepeatedKFold when True, a single shuffled KFold otherwise.

  Returns:
    The best-scoring estimator, fitted on all rows.
  """
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import pandas as pd
  from numpy import mean
  from xgboost import XGBClassifier
  from sklearn.neighbors import KNeighborsClassifier
  import sklearn.linear_model as lm, sklearn.ensemble as en

  features = df.drop(columns=[label])
  target = df[label]

  cv = (RepeatedKFold(n_splits=k, n_repeats=n, random_state=12345) if repeat
        else KFold(n_splits=k, random_state=12345, shuffle=True))

  # Candidate estimators, keyed by the name shown in the printed ranking
  candidates = {
      'Logistic': lm.LogisticRegression(max_iter=100),
      'KNN': KNeighborsClassifier(n_neighbors=3),
      'Ridge': lm.RidgeClassifier(),
      'AdaBoost': en.AdaBoostClassifier(n_estimators=100, random_state=12345),
      'GradBoost': en.GradientBoostingClassifier(random_state=12345),
      'XGBoost': XGBClassifier(objective = 'binary:logistic'),
  }

  scores = {}
  for name, estimator in candidates.items():
    fold_scores = cross_val_score(estimator, features, target, scoring='accuracy', cv=cv, n_jobs=-1)
    scores[name] = mean(fold_scores)

  df_fit = pd.DataFrame({'Accuracy': scores}).sort_values(by=['Accuracy'], ascending=False)

  print('Classification Models')
  print(df_fit)

  winner = df_fit.index[0]
  return candidates[winner].fit(features, target)

In [None]:
def clustering_model_kmeans(df, num_clusters):
  """Fit a KMeans model with `num_clusters` clusters on `df`.

  Args:
    df: Numeric DataFrame (or array) to cluster.
    num_clusters: Number of clusters.

  Returns:
    The fitted KMeans estimator; the cluster of each row is available through
    its `labels_` attribute.
  """
  import sklearn.cluster as cluster

  # A single fit is enough: the model used to be fitted a second time only to
  # fill a copy of the data that was then discarded. The seed is fixed, so the
  # returned model is the same.
  return cluster.KMeans(n_clusters=num_clusters, random_state=12345).fit(df)

In [None]:
# Recommends about 5 clusters
def clustering_model_test_cal_and_har(df, max_clusters=20):
  """Plot the Calinski-Harabasz score for KMeans with 2..max_clusters clusters.

  Args:
    df: Numeric DataFrame to cluster.
    max_clusters: Largest number of clusters to try (inclusive).

  Returns:
    `df`, unchanged.
  """
  # Only the modules that are used are imported, so the function no longer
  # requires plotly or seaborn to be installed.
  from sklearn.cluster import KMeans
  from sklearn.metrics import calinski_harabasz_score
  from matplotlib import pyplot as plt

  cluster_counts = range(2, max_clusters + 1)

  ch_score = []
  for n in cluster_counts:
    kmeans = KMeans(n_clusters=n, random_state=12345).fit(df)
    ch_score.append(calinski_harabasz_score(df, labels=kmeans.labels_))

  plt.plot(cluster_counts, ch_score, 'bx-')
  plt.xlabel('number of clusters') 
  plt.ylabel('Calinski_Harabasz Criterion') 
  plt.title('Optimal Number of Clusters')
  # The annotation position is in data coordinates chosen for the original data set
  plt.text(12, 40, 'Higher is better', bbox=dict(facecolor='red', alpha=0.5))
  plt.show()

  return df

In [None]:
#Recommends 2-3
def clustering_model_test_sil(df, max_clusters=20):
  """Plot the silhouette score for KMeans with 2..max_clusters clusters.

  Args:
    df: Numeric DataFrame to cluster.
    max_clusters: Largest number of clusters to try (inclusive).

  Returns:
    None; the plot is shown as a side effect.
  """
  # Only the modules that are used are imported, so the function no longer
  # requires plotly or seaborn to be installed.
  from sklearn.cluster import KMeans
  from sklearn.metrics import silhouette_score
  from matplotlib import pyplot as plt

  cluster_counts = range(2, max_clusters + 1)

  si_score = []
  for n in cluster_counts:
    kmeans = KMeans(n_clusters=n, random_state=12345).fit(df)
    si_score.append(silhouette_score(df, kmeans.labels_))

  plt.plot(cluster_counts, si_score, 'bx-')
  plt.xlabel('number of clusters') 
  plt.ylabel('Silhouette score') 
  plt.title('Optimal Number of Clusters')
  # The annotation position is in data coordinates chosen for the original data set
  plt.text(11, .14, 'Higher is better', bbox=dict(facecolor='red', alpha=0.5))
  plt.show()

In [None]:
#Recommends around 4
def clustering_model_test_elbow(df, max_clusters=20):
  """Plot the KMeans inertia (elbow plot) for 2..max_clusters clusters.

  The point that starts the segment with the highest (least negative) change
  in inertia is the only one drawn with a marker.

  Args:
    df: Numeric DataFrame to cluster.
    max_clusters: Largest number of clusters to try (inclusive).

  Returns:
    None; the plot is shown as a side effect.
  """
  # Only the modules that are used are imported, so the function no longer
  # requires plotly or seaborn to be installed.
  from sklearn.cluster import KMeans
  from matplotlib import pyplot as plt

  cluster_counts = range(2, max_clusters + 1)

  ss_score = []
  for n in cluster_counts:
    kmeans = KMeans(n_clusters=n, random_state=12345).fit(df)
    ss_score.append(kmeans.inertia_)

  # Where does the slope bend? Find the highest (least negative) slope.
  changes = [float(after - before) for before, after in zip(ss_score, ss_score[1:])]

  # With a single cluster count there is no slope, hence nothing to mark
  marked_points = [changes.index(max(changes))] if changes else []

  plt.plot(cluster_counts, ss_score, 'bx-', markevery=marked_points)
  plt.xlabel('number of clusters')
  plt.ylabel('SS distance')
  plt.title('Optimal Number of Clusters')
  # The annotation position is in data coordinates chosen for the original data set
  plt.text(8, 900, 'The point where slope "bends" from a \ndecreasing to increasing rate of change', bbox=dict(facecolor='red', alpha=0.5))
  plt.show()

# Step 10: Save Model

In [None]:
def dump_pickle(model, file_name):
  """Serialise `model` to `file_name` with pickle, overwriting an existing file.

  Args:
    model: Any picklable object.
    file_name: Path of the file to write.
  """
  import pickle

  # The context manager closes the file even if pickling fails
  with open(file_name, "wb") as model_file:
    pickle.dump(model, model_file)

def load_pickle(file_name):
  """Load and return the object pickled in `file_name`.

  Args:
    file_name: Path of a file written with pickle.

  Returns:
    The unpickled object.
  """
  import pickle

  # NOTE(security): unpickling can execute arbitrary code; only load files you created yourself.
  # The context manager closes the file even if unpickling fails
  with open(file_name, "rb") as model_file:
    return pickle.load(model_file)

# Step 11: Function Calls


In [None]:
# Driver cell: runs the whole pipeline (collect -> clean -> text features -> impute -> select -> model -> save).
# You may need to uncomment cell 1 and install the necessary packages, then restart the runtime,
# comment those lines out again, and then run the whole notebook.

df = collectData(url='https://the-sneaker-database.p.rapidapi.com/sneakers', numberOfRows=1350, brand=['jordan']) # Must pull in more than 550 rows because rows with missing values are filtered out
df = image_analytics(df)

# Collecting the data and running the image analytics can take about 1 hour.
# If you would prefer not to wait for that, use finalProjectImageData.csv instead.
# That csv holds the raw collected data plus the image analytics pulled from Imagga.
# Make sure to set 'name' as the index if you load it yourself.
# Alternatively, uncomment the next call and comment out the two lines above to load that csv:
# df = collectData2()

# Clean the data and add the text features.
# Each step below updates df in place and returns it; identify_entities also drops the
# 'story' and 'image' columns, so it has to run last.
df = bin_gender(df, col='gender', percent=0.05)
df = bin_silhouette(df, col='silhouette', percent=0.04)
df = fix_dates(df, col='releaseDate')
df = fix_skewness(df)
df = text_analytics(df)
df = sentiment_calc(df)
df = word_count(df)
df = identify_entities(df)

# Regression: predict the (log-transformed) estimated market value and save the best model
df_dummy_codes_market_value = impute_reg(df, 'estimatedMarketValue')
df_features_market_value = feature_selection_variance(df_dummy_codes_market_value, 'estimatedMarketValue', p=0.8)
model_reg = fit_crossvalidate_reg(df_features_market_value, 'estimatedMarketValue')
dump_pickle(model_reg, 'best_reg_model.sav')

# Classification: predict the gender group and save the best model
df_dummy_codes_gender = impute_reg(df, 'gender')
df_features_gender = feature_selection_variance(df_dummy_codes_gender, 'gender', p=0.8)
model_clf = fit_crossvalidate_clf(df_features_gender, 'gender')
dump_pickle(model_clf, 'best_clf_model.sav')

# The commented calls below help choose the number of clusters for the cluster model
# clustering_model_test_cal_and_har(df_dummy_codes_market_value)
# clustering_model_test_sil(df_dummy_codes_market_value)
# clustering_model_test_elbow(df_dummy_codes_market_value)

# Clustering: fit KMeans on the dummy-coded, imputed regression frame and save the model
model_cluster = clustering_model_kmeans(df_dummy_codes_market_value, num_clusters=3)
dump_pickle(model_cluster, 'best_cluster_model.sav')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



For a faster implementation, use the gensim.models.phrases.Phraser class



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Regression Models
          R-squared
XGBoost    0.480028
AdaBoost   0.389299
Ridge      0.278936
MLR        0.278002
Bayesian   0.274442
Lasso      0.270273
Inverse   -0.034402
Poisson   -0.034402
Classification Models
           Accuracy
XGBoost    0.753298
GradBoost  0.743668
Logistic   0.633203
AdaBoost   0.614218
KNN        0.550476
Ridge      0.548182
