In [1]:
import ast

import pandas as pd
import numpy as np
# NOTE(review): this binds the top-level matplotlib package (not matplotlib.pyplot)
# to the name `plt` — confirm that is intended before relying on pyplot functions.
import matplotlib as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, make_scorer, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier

from imblearn.pipeline import Pipeline as ImPipeline

from wordcloud import WordCloud

In [2]:
# Load the product metadata. The category / also_buy / also_view / details columns
# are converted from their text form into Python lists and dicts while reading.
# ast.literal_eval only accepts Python literals, whereas the previous `eval`
# converter would execute any expression that happened to be in the CSV.
# NOTE(review): the preview shows asin values such as 695009 while other ASINs in
# the data are 10 characters long (e.g. '0804187045') — confirm whether leading
# zeros are being lost and whether asin should be read as a string.
df1 = pd.read_csv(
    './data/meta_content.csv',
    converters={
        'category': ast.literal_eval,
        'also_buy': ast.literal_eval,
        'also_view': ast.literal_eval,
        'details': ast.literal_eval,
    },
)
df1.head()

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,asin,details
0,"[Movies & TV, Movies]",,Understanding Seizures and Epilepsy,[],,"886,503 in Movies & TV (",[],695009,{}
1,"[Movies & TV, Movies]",,Spirit Led&mdash;Moving By Grace In The Holy S...,[],,"342,688 in Movies & TV (",[],791156,{}
2,"[Movies & TV, Movies]",Disc 1: Flour Power (Scones; Shortcakes; South...,My Fair Pastry (Good Eats Vol. 9),[],Alton Brown,"370,026 in Movies & TV (",[],143529,{}
3,"[Movies & TV, Movies]",Barefoot Contessa Volume 2: On these three dis...,"Barefoot Contessa (with Ina Garten), Entertain...","[B002I5GNW4, B005WXPVMM, B009UY3W8O, B00N27ID1...",Ina Garten,"342,914 in Movies & TV (","[B002I5GNW4, 0804187045, B009UY3W8O, 060960219...",143588,{}
4,"[Movies & TV, Movies]",Rise and Swine (Good Eats Vol. 7) includes bon...,Rise and Swine (Good Eats Vol. 7),"[B000P1CKES, B000NR4CRM]",Alton Brown,"351,684 in Movies & TV (",[B0015SVNXY],143502,{}


In [3]:
# Summarise the metadata frame: column names, non-null counts, dtypes and memory use.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181839 entries, 0 to 181838
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     181839 non-null  object
 1   description  156721 non-null  object
 2   title        181781 non-null  object
 3   also_buy     181839 non-null  object
 4   brand        120867 non-null  object
 5   rank         181839 non-null  object
 6   also_view    181839 non-null  object
 7   asin         181839 non-null  object
 8   details      181839 non-null  object
dtypes: object(9)
memory usage: 12.5+ MB


In [4]:
# Work on an independent (deep) copy so the freshly loaded df1 is left untouched.
df_content = df1.copy(deep=True)

In [6]:
# Load the streaming reviews with pandas' default parsing, then preview the first rows.
df_asin = pd.read_csv(filepath_or_buffer='./data/reviews_streaming.csv')
df_asin.head(n=5)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewText,summary,vote
0,5,"10 24, 2017",A235CSFIMYI5JK,B00SU1DDVY,I love this movie. This man was awesome. Super...,I love this movie,0.0
1,5,"05 15, 2017",A1H41LX36UOWYA,B00SU1DDVY,I love this movie. I first saw it on Netflix ...,Movie,0.0
2,5,"04 3, 2017",A1TSMQSMIREU02,B00SU1DDVY,This movie was a great adventure despite havin...,Five Stars,0.0
3,5,"08 15, 2016",A2K62IHNYD9CBR,B00SU1DDVY,Must have watched five times by now,Five Stars,0.0
4,5,"05 27, 2016",AOGNYFSGIZGT4,B00SU1DDVY,Absolutely astounding and moving. Shows how ba...,and tactics of the free and better than the ri...,0.0


In [7]:
# Distinct ASINs that appear in the reviews, in order of first appearance,
# converted to a plain Python list.
reviewed_asins = df_asin['asin'].unique()
movie_streams = reviewed_asins.tolist()

In [8]:
# Keep only the metadata rows whose ASIN is present in the review ASIN list,
# then summarise the resulting subset.
has_streaming_review = df_content['asin'].isin(movie_streams)
df_content_filtered = df_content[has_streaming_review]
df_content_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2374 entries, 3154 to 181832
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     2374 non-null   object
 1   description  2260 non-null   object
 2   title        2374 non-null   object
 3   also_buy     2374 non-null   object
 4   brand        2076 non-null   object
 5   rank         2374 non-null   object
 6   also_view    2374 non-null   object
 7   asin         2374 non-null   object
 8   details      2374 non-null   object
dtypes: object(9)
memory usage: 185.5+ KB


In [9]:
# Display the category column of the filtered metadata.
df_content_filtered.loc[:, 'category']

3154      [Movies & TV, Genre for Featured Categories, A...
62174     [Movies & TV, Genre for Featured Categories, C...
64200     [Movies & TV, Genre for Featured Categories, C...
81061     [Movies & TV, Musicals & Performing Arts, Musi...
82519     [Movies & TV, Genre for Featured Categories, S...
                                ...                        
181812    [Movies & TV, Genre for Featured Categories, D...
181815    [Movies & TV, Studio Specials, Sony Pictures H...
181824                                                   []
181830                       [Movies & TV, Blu-ray, Movies]
181832                   [Movies & TV, HBO, All HBO Titles]
Name: category, Length: 2374, dtype: object

In [10]:
# Persist the filtered metadata as UTF-8 CSV; the row index is not written out.
df_content_filtered.to_csv(
    path_or_buf='./data/meta_streams.csv',
    index=False,
    encoding='utf-8',
)

In [14]:
# Read the saved subset back to check the round trip. The list/dict columns were
# written as text, so they are converted back into Python objects while reading.
# ast.literal_eval only accepts Python literals, whereas the previous `eval`
# converter would execute any expression that happened to be in the CSV.
df_test = pd.read_csv(
    './data/meta_streams.csv',
    converters={
        'category': ast.literal_eval,
        'also_buy': ast.literal_eval,
        'also_view': ast.literal_eval,
        'details': ast.literal_eval,
    },
)
df_test.head()

Unnamed: 0,category,description,title,also_buy,brand,rank,also_view,asin,details
0,"[Movies & TV, Genre for Featured Categories, A...",Kids on an outing in the forest come up agains...,Bridge to Nowhere VHS,[],Matthew Hunter,"377,418 in Movies & TV (",[],6300133613,{}
1,"[Movies & TV, Genre for Featured Categories, C...","Alberto (Oscar Jaenada, Noviembre) and Susana ...",El Juego De La Verdad,[],Tristan Ulloa,"232,919 in Movies & TV (",[],B000G8NXOQ,"{'Language:': 'Spanish', 'Subtitles:': 'Englis..."
2,"[Movies & TV, Genre for Featured Categories, C...",A hard nosed FBI agent and her dim witted part...,Nosferatu's Crush,[],"Malissa Longo, John Gaydos, Lady Altovise, Ang...","308,791 in Movies & TV (","[B07FNRHK7M, B07GJNR9R6, B07JJ6HQ7Q, B07HGBTYH...",B000HEVATO,"{'Subtitles:': 'English', 'ASIN: ': 'B000HEVATO'}"
3,"[Movies & TV, Musicals & Performing Arts, Musi...",Competition is hot in the rodeo business as we...,Lights From Old Sante Fe 1944,[],Roy Rogers,"435,230 in Movies & TV (",[],B000W92BD8,{'ASIN: ': 'B000W92BD8'}
4,"[Movies & TV, Genre for Featured Categories, S...","VILLE DE QUEBEC On behalf of the French crown,...",Vista Point Town Of QUEBEC Canada,[],,"515,573 in Movies & TV (",[],B000XJO1GM,"{'Language:': 'English', 'Subtitles:': 'Englis..."


In [16]:
# Sanity check: list the distinct Python types found in the reloaded category column.
category_value_types = df_test['category'].apply(type)
category_value_types.unique()

array([<class 'list'>], dtype=object)