In [3]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, make_scorer, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier

from imblearn.pipeline import Pipeline as ImPipeline

from wordcloud import WordCloud

In [4]:
# Columns stored in the CSV as stringified Python literals (the outputs below show
# lists in `category` and dicts in `details`); they are parsed back while loading.
# NOTE(review): `eval` executes whatever expression a cell contains, so this is only
# safe while the CSV is fully trusted. `ast.literal_eval` is the usual safer parser
# for plain literals -- confirm every cell is a plain literal before switching.
literal_columns = ['feature', 'category', 'also_buy', 'also_view', 'details']
df1 = pd.read_csv(
    './data/meta_allstreams.csv',
    converters={column: eval for column in literal_columns},
)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28129 entries, 0 to 28128
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category     28129 non-null  object
 1   description  26198 non-null  object
 2   title        28128 non-null  object
 3   also_buy     28129 non-null  object
 4   brand        24829 non-null  object
 5   feature      28129 non-null  object
 6   rank         28129 non-null  object
 7   also_view    28129 non-null  object
 8   asin         28129 non-null  object
 9   details      28129 non-null  object
dtypes: object(10)
memory usage: 2.1+ MB


In [3]:
# Spot-check one parsed `category` cell, looked up by row label.
df1.loc[999, 'category']

['Movies & TV', 'Genre for Featured Categories', 'Comedy']

In [308]:
# List the distinct Python types held in the parsed `details` column.
detail_types = df1['details'].apply(type)
detail_types.unique()

array([<class 'dict'>], dtype=object)

In [126]:
# Inspect the rows whose category list contains 'Exercise & Fitness'.
fitness_rows = df1['category'].apply(lambda labels: 'Exercise & Fitness' in labels)
df_check_genres = df1[fitness_rows]
df_check_genres['category'].value_counts()

category
[Movies & TV, Genre for Featured Categories, Exercise & Fitness]    28
Name: count, dtype: int64

In [5]:
# Drop the two cross-reference columns and rename `category`/`brand` to the names
# used from here on. `details` is kept on purpose: a later cell derives the
# language flags from it before dropping it.
df = (
    df1
    .drop(columns=['also_buy', 'also_view'])
    .rename(columns={'category': 'genre', 'brand': 'starring'})
)

In [6]:
def _strip_movies_tv_root(labels):
    """Drop a leading 'Movies & TV' entry when at least one other entry follows it."""
    if len(labels) > 1 and labels[0] == 'Movies & TV':
        return labels[1:]
    return labels


df['genre'] = df['genre'].apply(_strip_movies_tv_root)

In [7]:
# Remove every title whose genre list contains 'Exercise & Fitness'.
# `.copy()` makes the filtered frame an independent object, so the column
# assignments in the following cells do not write to a slice of the previous
# frame (the source of SettingWithCopyWarning / chained-assignment ambiguity).
is_fitness = df['genre'].apply(lambda labels: 'Exercise & Fitness' in labels)
df = df.loc[~is_fitness].copy()

In [8]:
def _merge_art_house_language(labels):
    """Collapse an 'Art House & International' genre list into one combined label.

    A list with more than two entries whose first entry is
    'Art House & International' becomes a one-element list made of the first and
    third entries joined by a space. Any other value is returned unchanged.
    """
    if (
        isinstance(labels, list)
        and len(labels) > 2
        and labels[0] == 'Art House & International'
    ):
        # Entries after the third are discarded, as before.
        return [labels[0] + ' ' + labels[2]]
    return labels


# One guarded pass replaces the earlier masked assignment and the follow-up apply.
# That follow-up could never match, because every qualifying row had already been
# reduced to a single-element list by the first statement.
df['genre'] = df['genre'].apply(_merge_art_house_language)

In [9]:
def remove_genre_from_list(lst, exclude=None):
    """Return the entries of `lst` that are not in the exclusion list.

    Parameters
    ----------
    lst : iterable
        Labels to filter.
    exclude : container, optional
        Labels to drop. When omitted, the notebook-level `remove_genre` list is
        looked up at call time, so it must be defined before the first such call.

    Returns
    -------
    list
        The entries of `lst`, in their original order, that are not in the
        exclusion list.
    """
    blocked = remove_genre if exclude is None else exclude
    return [genre for genre in lst if genre not in blocked]

In [10]:
# Labels that are stripped out of every genre list.
remove_genre = [
    'Genre for Featured Categories',
    'Boxed Sets',
    'All Titles',
    'Independently Distributed',
    'All Sony Pictures Titles',
    'Studio Specials',
    'By Original Language',
    'All BBC Titles',
    'General',
    '20th Century Fox Home Entertainment',
    'Family Features',
    'Lionsgate Home Entertainment',
    'Fully Loaded DVDs',
    'Blu-ray',
    'Holidays & Seasonal',
    'All HBO Titles',
    'Musicals & Performing Arts',
    'Paramount Home Entertainment',
    'Music Artists',
    'Walt Disney Studios Home Entertainment',
    'All',
    'Universal Studios Home Entertainment',
]


def _drop_listed_labels(labels):
    """Filter list-valued cells through `remove_genre_from_list`; pass others through."""
    if isinstance(labels, list):
        return remove_genre_from_list(labels)
    return labels


df['genre'] = df['genre'].apply(_drop_listed_labels)

In [11]:
# Take the first run of digits/commas from each rank string, strip the thousands
# separators and store the result as a nullable integer column.
rank_digits = (
    df['rank']
    .str.extract(r'([\d,]+)', expand=False)
    .str.replace(',', '', regex=False)  # literal comma, not a pattern
    .replace('', np.nan)
)
df['rank'] = rank_digits.astype(float).astype('Int64')

# Fill unparsed ranks with the (truncated) median. The result is assigned back
# instead of calling `fillna(inplace=True)` on `df['rank']`: that form operates on a
# column selection, warns in pandas 2.x and leaves `df` untouched under Copy-on-Write.
df['rank'] = df['rank'].fillna(int(df['rank'].median()))

In [12]:
# Descriptions that consist solely of one of these seller/listing notices are
# blanked out (exact match only) so they count as missing descriptions.
listing_boilerplate = [
    "Quick Shipping !!! New And Sealed !!! This Disc WILL NOT play on standard US DVD player. A multi-region PAL/NTSC DVD player is request to view it in USA/Canada. Please Review Description.",
    "Original Reliance DVD",
    "Like New Condition! Quick Shipping (Within 24-48hrs). DVD Case & Artwork In Excellent Condition! Previous Rental Some items include Azura Disc Scratch Protection. Digital Copy not available!",
    "Original Yashraj DVD",
    "Original Eros DVD",
    "<i>When sold by Amazon.com, this product will be manufactured on demand using DVD-R recordable media. Amazon.com's standard return policy will apply.</i>",
    "NOTICE: Polish Release, cover may contain Polish text/markings. The disk has English audio.",
]
is_boilerplate = df['description'].isin(listing_boilerplate)
df['description'] = df['description'].mask(is_boilerplate)

In [13]:
# Descriptions that are exactly one of these short tags are also treated as missing.
format_only_descriptions = ["DVD", "dvd", "BRAY"]
is_format_only = df['description'].isin(format_only_descriptions)
df['description'] = df['description'].mask(is_format_only)

In [14]:
# `starring` values that are replaced by the catch-all label 'Various'.
convert = ['.', '\n', '-', '--', 'Na', 'BRIDGESTONE MULTIMEDIA']


def _normalise_starring(name):
    """Map listed placeholder strings and any string containing a newline to 'Various'."""
    if isinstance(name, str) and (name in convert or '\n' in name):
        return 'Various'
    return name


# Missing values get the same label. The filled column is assigned back rather
# than using `fillna(inplace=True)` on `df['starring']`, which acts on a column
# selection (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
df['starring'] = df['starring'].apply(_normalise_starring).fillna('Various')

In [16]:
def _clean_details(details):
    """Remove 'ASIN:' entries from one details dict and default empty results.

    Keys whose stripped string form starts with 'ASIN:' are dropped. If nothing
    is left, the dict {'Language:': 'English'} is returned instead.
    """
    kept = {
        key: value
        for key, value in details.items()
        if not str(key).strip().startswith('ASIN:')
    }
    if kept:
        return kept
    # Titles with no remaining details are assumed to be English-language.
    return {'Language:': 'English'}


df['details'] = df['details'].apply(_clean_details)

In [17]:
def _lists_english(details, key):
    """Return 1 if the `key` entry of `details` names English, else 0.

    The entry may be the single value 'English' or a ', '-separated list such as
    'English, French'. The previous expression applied the comma split to the
    details dict itself (`isinstance(x, str)`), which is never true for a dict,
    so multi-language entries were always scored 0.
    """
    if isinstance(details, dict):
        value = details.get(key)
        if isinstance(value, str) and 'English' in value.split(', '):
            return 1
    elif isinstance(details, str) and 'English' in details.split(', '):
        # Fallback for a plain-string cell, kept from the original expression.
        return 1
    return 0


# Flag column -> key in the details dict that it is derived from.
language_keys = {'english': 'Language:', 'eng_sub': 'Subtitles:', 'eng_dub': 'Dubbed:'}
for flag_column, details_key in language_keys.items():
    df[flag_column] = df['details'].apply(_lists_english, args=(details_key,))

# `details` is no longer needed once the flags exist.
df = df.drop(columns='details')
df.head()

Unnamed: 0,genre,description,title,starring,feature,rank,asin,english,eng_sub,eng_dub
0,[Art House & International Spanish],,Peace Child VHS,Various,[],866012,1527665,1,0,0
1,[Christian Video],An early movie edition of the life of Jesus.,Where Jesus Walked VHS,Various,[],1108385,5000009,1,0,0
2,[Movies],"In Depression-era New England, a miserly busin...",An American Christmas Carol VHS,Various,[],704028,5019281,1,0,0
3,[Documentary],This documentary takes you on a journey...from...,A NATION ADRIFT A Chronicle of America's Prov...,Tom Kane,[],496880,5092663,1,0,0
4,"[Science Fiction & Fantasy, Science Fiction, A...",This is The VHS Movie: SANTA CLAUS IS COMIN TO...,Santa Claus Is Comin' To Town VHS,Fred Astaire,[],347709,307142493,1,0,0


In [22]:
# Frequency of each distinct value in the `feature` column.
feature_counts = df['feature'].value_counts()
feature_counts

feature
[]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [400]:
# Rows whose description is missing.
missing_description = df['description'].isnull()
df[missing_description]

Unnamed: 0,genre,description,title,starring,rank,asin,english,eng_sub,eng_dub
6,[Horror],,Mad Monster Rally,Various,57899,B002BFBAYW,1,0,0
7,[Art House & International Spanish],,Pan's Labyrinth [Blu-ray] [Blu-ray] (2009),Various,32994,B002BYYA9A,0,1,0
12,[Art House & International Japanese],,Legend of the Fist,Various,318460,B004C9MCGA,0,1,0
18,[Movies],,Jack Frost [DVD],Various,55459,B007I6G08U,1,0,0
22,[Movies],,How To Use Guys With Secret Tips (Region 3 DVD...,Various,761370,B00J9IRCWS,1,0,0
...,...,...,...,...,...,...,...,...,...
2279,[Movies],,Don't Look In The Basement 2 Blu-Ray,Various,294028,B01G9DG5CI,1,0,0
2284,[Movies],,DAYLIGHT'S END (BLU-RAY),Various,157834,B01GF9N7FE,0,0,0
2296,[Warner Home Video],,V for Vendetta,Natalie Portman,186012,B01GJQM5JM,1,0,0
2307,[All Lionsgate Titles],,A Hologram For The King Digital,Tom Hanks,16605,B01GP4HSH2,1,0,0


___

In [None]:
# COME BACK TO THIS IF YOU WANT TO GET MORE GRANULAR BUT I DON'T THINK IT'S NECESSARY
#df_genre = df_mod.explode('category').rename(columns={'category': 'genre'})
#df_genre.info()

___