### Data Pre-Processing

In [119]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data=pd.read_csv('movies.csv/movies.csv')
df=data.copy()

In [120]:
print("dimensions of the dataset:", df.shape)
print("columns in the dataset:", list(df.columns))

print(df.info())

dimensions of the dataset: (9999, 9)
columns in the dataset: ['MOVIES', 'YEAR', 'GENRE', 'RATING', 'ONE-LINE', 'STARS', 'VOTES', 'RunTime', 'Gross']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOVIES    9999 non-null   object 
 1   YEAR      9355 non-null   object 
 2   GENRE     9919 non-null   object 
 3   RATING    8179 non-null   float64
 4   ONE-LINE  9999 non-null   object 
 5   STARS     9999 non-null   object 
 6   VOTES     8179 non-null   object 
 7   RunTime   7041 non-null   float64
 8   Gross     460 non-null    object 
dtypes: float64(2), object(7)
memory usage: 703.2+ KB
None


In [121]:
df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [122]:
df.dtypes

MOVIES       object
YEAR         object
GENRE        object
RATING      float64
ONE-LINE     object
STARS        object
VOTES        object
RunTime     float64
Gross        object
dtype: object

### 1.Type Casting

some features are not in their original type. so let's make it right.

here date,votes are not strings, lets convert them into their original type

In [123]:
df['YEAR']=df['YEAR'].str.strip('()')
df['YEAR']=df['YEAR'].str.rstrip('')
df['YEAR']=df['YEAR'].str.replace(r'[\-‒–—―−]+$', '', regex=True)

df['YEAR']

0            2021
1          2021– 
2       2010–2022
3          2013– 
4            2021
          ...    
9994       2021– 
9995       2021– 
9996       2022– 
9997       2021– 
9998       2021– 
Name: YEAR, Length: 9999, dtype: object

In [124]:
df[['START_YEAR','END_YEAR']]=df['YEAR'].str.extract(r'(\d{4})(?:[-–](\d{4}))?')
df.drop('YEAR',axis=1,inplace=True)


In [125]:
df['VOTES']=pd.to_numeric(df['VOTES'],errors='coerce')

In [126]:
df.dtypes

MOVIES         object
GENRE          object
RATING        float64
ONE-LINE       object
STARS          object
VOTES         float64
RunTime       float64
Gross          object
START_YEAR     object
END_YEAR       object
dtype: object

In [127]:
df['Gross']=pd.to_numeric(df['Gross'],errors='coerce')
df['START_YEAR']=pd.to_datetime(df['START_YEAR'],format='%Y')
df['END_YEAR']=pd.to_datetime(df['END_YEAR'],format='%Y')
df.dtypes


MOVIES                object
GENRE                 object
RATING               float64
ONE-LINE              object
STARS                 object
VOTES                float64
RunTime              float64
Gross                float64
START_YEAR    datetime64[ns]
END_YEAR      datetime64[ns]
dtype: object

In [128]:
df.head()

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,,121.0,,2021-01-01,NaT
1,Masters of the Universe: Revelation,"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",,25.0,,2021-01-01,NaT
2,The Walking Dead,"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",,44.0,,2010-01-01,2022-01-01
3,Rick and Morty,"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",,23.0,,2013-01-01,NaT
4,Army of Thieves,"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,,2021-01-01,NaT


Done with Type casting and now let's clean the columns which have white spaces like'\n'

In [129]:
df['GENRE']=df['GENRE'].str.strip()
df['ONE-LINE']=df['ONE-LINE'].str.strip()
df['STARS']=df['STARS'].str.replace(r'\s+', '', regex=True)

df.head()

Unnamed: 0,MOVIES,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross,START_YEAR,END_YEAR
0,Blood Red Sky,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,"Director:PeterThorwarth|Stars:PeriBaumeister,C...",,121.0,,2021-01-01,NaT
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"Stars:ChrisWood,SarahMichelleGellar,LenaHeadey...",,25.0,,2021-01-01,NaT
2,The Walking Dead,"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:AndrewLincoln,NormanReedus,MelissaMcBrid...",,44.0,,2010-01-01,2022-01-01
3,Rick and Morty,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"Stars:JustinRoiland,ChrisParnell,SpencerGramme...",,23.0,,2013-01-01,NaT
4,Army of Thieves,"Action, Crime, Horror",,"A prequel, set before the events of Army of th...",Director:MatthiasSchweighöfer|Stars:MatthiasSc...,,,,2021-01-01,NaT


### 2. Duplicate records handling

In [130]:
df.duplicated(keep='last').sum()


431

Should delete the duplicates

In [131]:
df.drop_duplicates(keep='first',inplace=True)
df.shape

(9568, 10)

### 3.Imputation

In [132]:
#spplit data set into numeric and numeric 

df_num=df.select_dtypes(include=['int64','float64'])
df_cat=df.select_dtypes(include='object')

In [133]:
df_num.describe()

Unnamed: 0,RATING,VOTES,RunTime,Gross
count,8168.0,4418.0,7008.0,0.0
mean,6.919699,292.167723,68.874144,
std,1.21997,271.386878,47.27889,
min,1.1,5.0,1.0,
25%,6.2,72.25,37.0,
50%,7.1,193.0,60.0,
75%,7.8,459.75,95.0,
max,9.9,999.0,853.0,


In [134]:
df_num.isnull().sum()

RATING     1400
VOTES      5150
RunTime    2560
Gross      9568
dtype: int64

In [135]:
df_num.drop('Gross',axis=1,inplace=True)


In [136]:
df_num['RATING'].median()

7.1

In [137]:
df_num.fillna(df_num.median(),inplace=True)

In [138]:
df_cat.mode()

Unnamed: 0,MOVIES,GENRE,ONE-LINE,STARS
0,Bleach: Burîchi,Comedy,Add a Plot,


In [139]:
df_cat['GENRE'].mode()

0    Comedy
Name: GENRE, dtype: object

In [140]:
df_cat=df_cat.fillna(df_cat.mode())

In [141]:
df_cat['GENRE'].value_counts()

Comedy                             837
Animation, Action, Adventure       663
Drama                              546
Documentary                        495
Crime, Drama, Mystery              328
                                  ... 
Documentary, Adventure, History      1
Comedy, Crime, Romance               1
Drama, Romance, Western              1
Comedy, Drama, Western               1
Crime, Mystery, Romance              1
Name: GENRE, Length: 510, dtype: int64

### 4. Outlier Treatment


In [142]:
def outlier(df):
    for i in list(df.columns):
        u=df[i].quantile(0.95)
        l=df[i].quantile(0.05)
        df[i]=np.where(df[i]<l,
                       l,
                       (np.where(
                           df[i]>u,
                           u,
                           df[i])))
    return df
        
            
            
        

In [143]:
outlier(df=df_num)

Unnamed: 0,RATING,VOTES,RunTime
0,6.1,193.0,120.0
1,5.0,193.0,25.0
2,8.2,193.0,44.0
3,8.6,193.0,23.0
4,7.1,193.0,60.0
...,...,...,...
9993,7.1,193.0,60.0
9995,7.1,193.0,60.0
9996,7.1,193.0,60.0
9997,7.1,193.0,60.0


In [144]:
df_num.boxplot(column='RunTime')

<Axes: >

see after the outlier treatment 

### 5. Zero Variance

In [145]:
df_num.var()

RATING         0.995498
VOTES      28527.865956
RunTime      784.119346
dtype: float64

If the variance is zero or nearer to zero then that feature is almost same for every record so we can eliminate to reduce the compute cost. here none is nearer to zero nor zero. so we keep all features according to this variance factor.

### 6. Descretization

Data discretization, also known as binning, is the process of grouping continuous values of variables into contiguous intervals. This procedure transforms continuous variables into discrete variables, and it is commonly used in data mining and data science.

here in this case, there is no purpose to do that so let's skip it.

### 7. Feature Encoding

In [146]:
df_cat

Unnamed: 0,MOVIES,GENRE,ONE-LINE,STARS
0,Blood Red Sky,"Action, Horror, Thriller",A woman with a mysterious illness is forced in...,"Director:PeterThorwarth|Stars:PeriBaumeister,C..."
1,Masters of the Universe: Revelation,"Animation, Action, Adventure",The war for Eternia begins again in what may b...,"Stars:ChrisWood,SarahMichelleGellar,LenaHeadey..."
2,The Walking Dead,"Drama, Horror, Thriller",Sheriff Deputy Rick Grimes wakes up from a com...,"Stars:AndrewLincoln,NormanReedus,MelissaMcBrid..."
3,Rick and Morty,"Animation, Adventure, Comedy",An animated series that follows the exploits o...,"Stars:JustinRoiland,ChrisParnell,SpencerGramme..."
4,Army of Thieves,"Action, Crime, Horror","A prequel, set before the events of Army of th...",Director:MatthiasSchweighöfer|Stars:MatthiasSc...
...,...,...,...,...
9993,Totenfrau,"Drama, Thriller",Add a Plot,"Director:NicolaiRohde|Stars:FelixKlare,RominaK..."
9995,Arcane,"Animation, Action, Adventure",Add a Plot,
9996,Heart of Invictus,"Documentary, Sport",Add a Plot,Director:OrlandovonEinsiedel|Star:PrinceHarry
9997,The Imperfects,"Adventure, Drama, Fantasy",Add a Plot,Director:JovankaVuckovic|Stars:MorganTaylorCam...


here in this case genre is not affective in modeling, so we can better eliminate it.

In [147]:
df_cat.drop('MOVIES',axis=1,inplace=True)

In [148]:
df_cat['STARS']


0       Director:PeterThorwarth|Stars:PeriBaumeister,C...
1       Stars:ChrisWood,SarahMichelleGellar,LenaHeadey...
2       Stars:AndrewLincoln,NormanReedus,MelissaMcBrid...
3       Stars:JustinRoiland,ChrisParnell,SpencerGramme...
4       Director:MatthiasSchweighöfer|Stars:MatthiasSc...
                              ...                        
9993    Director:NicolaiRohde|Stars:FelixKlare,RominaK...
9995                                                     
9996        Director:OrlandovonEinsiedel|Star:PrinceHarry
9997    Director:JovankaVuckovic|Stars:MorganTaylorCam...
9998    Director:JovankaVuckovic|Stars:MorganTaylorCam...
Name: STARS, Length: 9568, dtype: object

In [149]:
df_cat['Director']=df_cat['STARS'].str.extract('Director:(.*?)\|')
df_cat['Stars']=df_cat['STARS'].str.extract('Stars:(.*)')

In [150]:
df_cat.drop('STARS',axis=1,inplace=True)

In [151]:
df_cat.head()

Unnamed: 0,GENRE,ONE-LINE,Director,Stars
0,"Action, Horror, Thriller",A woman with a mysterious illness is forced in...,PeterThorwarth,"PeriBaumeister,CarlAntonKoch,AlexanderScheer,K..."
1,"Animation, Action, Adventure",The war for Eternia begins again in what may b...,,"ChrisWood,SarahMichelleGellar,LenaHeadey,MarkH..."
2,"Drama, Horror, Thriller",Sheriff Deputy Rick Grimes wakes up from a com...,,"AndrewLincoln,NormanReedus,MelissaMcBride,Laur..."
3,"Animation, Adventure, Comedy",An animated series that follows the exploits o...,,"JustinRoiland,ChrisParnell,SpencerGrammer,Sara..."
4,"Action, Crime, Horror","A prequel, set before the events of Army of th...",MatthiasSchweighöfer,"MatthiasSchweighöfer,NathalieEmmanuel,RubyO.Fe..."


In [152]:
df_cat.isnull().sum()

GENRE         78
ONE-LINE       0
Director    4543
Stars       1203
dtype: int64

In [153]:
df_cat['GENRE']=df_cat['GENRE'].fillna('').str.strip()
df_cat['Director']=df_cat['Director'].fillna('').str.strip()
df_cat['Stars']=df_cat['Stars'].fillna('').str.strip()

In [154]:
df_cat.isnull().sum()

GENRE       0
ONE-LINE    0
Director    0
Stars       0
dtype: int64

In [155]:
df_cat['GENRE']=df_cat['GENRE'].apply(lambda s: [t.strip() for t in s.split(',')])


In [160]:
df_cat['Director']=df_cat['Director'].apply(lambda s: {t.strip() for t in s.split(',')})


AttributeError: 'set' object has no attribute 'split'

In [157]:
df_cat['Stars']=df_cat['Stars'].apply(lambda s: {t.strip() for t in s.split(',')})
df_cat

Unnamed: 0,GENRE,ONE-LINE,Director,Stars
0,"[Action, Horror, Thriller]",A woman with a mysterious illness is forced in...,{PeterThorwarth},"{KaisSetti, CarlAntonKoch, PeriBaumeister, Ale..."
1,"[Animation, Action, Adventure]",The war for Eternia begins again in what may b...,{},"{LenaHeadey, SarahMichelleGellar, MarkHamill, ..."
2,"[Drama, Horror, Thriller]",Sheriff Deputy Rick Grimes wakes up from a com...,{},"{MelissaMcBride, LaurenCohan, NormanReedus, An..."
3,"[Animation, Adventure, Comedy]",An animated series that follows the exploits o...,{},"{SarahChalke, ChrisParnell, SpencerGrammer, Ju..."
4,"[Action, Crime, Horror]","A prequel, set before the events of Army of th...",{MatthiasSchweighöfer},"{MatthiasSchweighöfer, NathalieEmmanuel, RubyO..."
...,...,...,...,...
9993,"[Drama, Thriller]",Add a Plot,{NicolaiRohde},"{FelixKlare, RobertPalfrader, AnnaMariaMühe, R..."
9995,"[Animation, Action, Adventure]",Add a Plot,{},{}
9996,"[Documentary, Sport]",Add a Plot,{OrlandovonEinsiedel},{}
9997,"[Adventure, Drama, Fantasy]",Add a Plot,{JovankaVuckovic},"{RhiannaJagpal, IñakiGodoy, SiddharthaMinhas, ..."


In [162]:
df_cat

Unnamed: 0,GENRE,ONE-LINE,Director,Stars
0,"[Action, Horror, Thriller]",A woman with a mysterious illness is forced in...,{PeterThorwarth},"{KaisSetti, CarlAntonKoch, PeriBaumeister, Ale..."
1,"[Animation, Action, Adventure]",The war for Eternia begins again in what may b...,{},"{LenaHeadey, SarahMichelleGellar, MarkHamill, ..."
2,"[Drama, Horror, Thriller]",Sheriff Deputy Rick Grimes wakes up from a com...,{},"{MelissaMcBride, LaurenCohan, NormanReedus, An..."
3,"[Animation, Adventure, Comedy]",An animated series that follows the exploits o...,{},"{SarahChalke, ChrisParnell, SpencerGrammer, Ju..."
4,"[Action, Crime, Horror]","A prequel, set before the events of Army of th...",{MatthiasSchweighöfer},"{MatthiasSchweighöfer, NathalieEmmanuel, RubyO..."
...,...,...,...,...
9993,"[Drama, Thriller]",Add a Plot,{NicolaiRohde},"{FelixKlare, RobertPalfrader, AnnaMariaMühe, R..."
9995,"[Animation, Action, Adventure]",Add a Plot,{},{}
9996,"[Documentary, Sport]",Add a Plot,{OrlandovonEinsiedel},{}
9997,"[Adventure, Drama, Fantasy]",Add a Plot,{JovankaVuckovic},"{RhiannaJagpal, IñakiGodoy, SiddharthaMinhas, ..."


In [163]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
genre_feats = mlb.fit_transform(df_cat['GENRE'])
genre_feature_names = ['GENRE_'+g for g in mlb.classes_]
genre_df = pd.DataFrame(genre_feats, columns=genre_feature_names, index=df_cat.index)


In [184]:
genre_df

Unnamed: 0,GENRE_,GENRE_Action,GENRE_Adventure,GENRE_Animation,GENRE_Biography,GENRE_Comedy,GENRE_Crime,GENRE_Documentary,GENRE_Drama,GENRE_Family,...,GENRE_News,GENRE_Reality-TV,GENRE_Romance,GENRE_Sci-Fi,GENRE_Short,GENRE_Sport,GENRE_Talk-Show,GENRE_Thriller,GENRE_War,GENRE_Western
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9995,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
9997,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
from sklearn.preprocessing import MultiLabelBinarizer

MB=MultiLabelBinarizer()
director_feats=MB.fit_transform(df_cat['Director'])
director_feats_names=['Director_'+t for t in MB.classes_]
dfirector_df=pd.DataFrame(director_feats,columns=director_feats_names,index=df_cat.index)
dfirector_df

Unnamed: 0,Director_,Director_AadishKeluskar,Director_AaronAugenblick,Director_AaronBurns,Director_AaronLieber,Director_AaronLong,Director_AaronSorkin,Director_AbdelhamidBouchnak,Director_AbhijitPanse,Director_AbhinayDeo,...,Director_ZsoltPálfi,Director_ZulkarnainAzhar,Director_ÁlexdelaIglesia,Director_ÁlvaroBrechner,Director_ÁlvaroFernándezArmero,Director_ÁngelGómezHernández,Director_ÁngelesReiné,Director_ÅkeSandgren,Director_ÓscarPedraza,Director_ÖmerUgur
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
from sklearn.preprocessing import MultiLabelBinarizer

MB=MultiLabelBinarizer()
Stars_feats=MB.fit_transform(df_cat['Stars'])
Starts_feats_names=['Star_'+t for t in MB.classes_]
Stars_df=pd.DataFrame(Stars_feats,columns=Starts_feats_names,index=df_cat.index)
Stars_df

Unnamed: 0,Star_,Star_2'LiveBre,Star_2Chainz,Star_2Mex,Star_50Cent,Star_A.J.Baime,Star_A.J.Daulerio,Star_A.J.LoCascio,Star_A.N.T.I.,Star_ABoogiewitdaHoodie,...,Star_ÖzgeBorak,Star_ÖzgeÖzpirinçci,Star_ÖzgürEmreYildirim,Star_ÖzgürOzan,Star_ÖzkanUgur,Star_ÖzzNûjen,Star_ÚrsulaCorberó,Star_ÚrsulaPruneda,Star_ÜlküDuru,Star_ÞorsteinnBachmann
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [166]:
from sklearn.decomposition import TruncatedSVD

ts=TruncatedSVD(n_components=50)
directors_final=ts.fit_transform(dfirector_df)
directors_final=pd.DataFrame(directors_final)



In [214]:
directors_final.columns=directors_final.columns.astype('str')
directors_final.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48',
       '49'],
      dtype='object')

In [215]:
from sklearn.decomposition import TruncatedSVD

tsv=TruncatedSVD(n_components=50)
stars_final=tsv.fit_transform(Stars_df)
stars_final=pd.DataFrame(stars_final)
stars_final.columns=stars_final.columns.astype('str')



In [171]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
x=tf.fit_transform(df_cat['ONE-LINE'])

In [None]:
oneline_df=pd.DataFrame(x.toarray(),columns=tf.get_feature_names_out())


'coastal'

In [173]:
from sklearn.decomposition import TruncatedSVD

tsvo=TruncatedSVD(n_components=50)
one_line=tsvo.fit_transform(oneline_df)
oneline_df_final=pd.DataFrame(one_line)
oneline_df_final


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.000134,0.122309,0.044257,-0.009415,0.001350,0.008867,-0.030256,0.095940,-0.048968,-0.013427,...,-0.009668,-0.004068,-0.010394,0.003189,0.011380,0.057383,0.012156,0.080962,-0.009445,-0.032020
1,0.000167,0.180673,-0.035150,0.041508,-0.013829,-0.000181,-0.007393,-0.032919,0.097502,0.049315,...,-0.081211,0.100061,0.013263,-0.071322,0.012164,-0.008830,-0.025017,-0.042778,0.031455,-0.020281
2,0.000185,0.178081,-0.014276,0.007586,0.038431,0.000599,-0.041744,0.011991,0.056780,-0.074387,...,-0.023215,-0.016455,-0.038824,0.068950,0.044964,-0.016650,0.046170,0.012192,0.052725,0.037394
3,0.000187,0.157079,-0.048229,0.052748,-0.034824,-0.042237,0.038392,0.030356,-0.024613,0.064147,...,-0.024922,0.066593,0.017825,-0.071090,-0.024666,-0.051029,-0.015186,-0.027923,0.077967,-0.026163
4,0.000246,0.189325,-0.180452,-0.052325,-0.047123,-0.076568,-0.052913,0.041091,-0.070692,0.017261,...,-0.009447,0.014709,-0.039494,0.035182,0.035217,0.011850,0.027109,0.044300,0.043675,0.026046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9563,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9564,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9565,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9566,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298


In [217]:
oneline_df_final.columns=oneline_df_final.columns.astype('str')

In [218]:
oneline_df_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.000134,0.122309,0.044257,-0.009415,0.001350,0.008867,-0.030256,0.095940,-0.048968,-0.013427,...,-0.009668,-0.004068,-0.010394,0.003189,0.011380,0.057383,0.012156,0.080962,-0.009445,-0.032020
1,0.000167,0.180673,-0.035150,0.041508,-0.013829,-0.000181,-0.007393,-0.032919,0.097502,0.049315,...,-0.081211,0.100061,0.013263,-0.071322,0.012164,-0.008830,-0.025017,-0.042778,0.031455,-0.020281
2,0.000185,0.178081,-0.014276,0.007586,0.038431,0.000599,-0.041744,0.011991,0.056780,-0.074387,...,-0.023215,-0.016455,-0.038824,0.068950,0.044964,-0.016650,0.046170,0.012192,0.052725,0.037394
3,0.000187,0.157079,-0.048229,0.052748,-0.034824,-0.042237,0.038392,0.030356,-0.024613,0.064147,...,-0.024922,0.066593,0.017825,-0.071090,-0.024666,-0.051029,-0.015186,-0.027923,0.077967,-0.026163
4,0.000246,0.189325,-0.180452,-0.052325,-0.047123,-0.076568,-0.052913,0.041091,-0.070692,0.017261,...,-0.009447,0.014709,-0.039494,0.035182,0.035217,0.011850,0.027109,0.044300,0.043675,0.026046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9563,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9564,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9565,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9566,0.999979,-0.001097,0.000163,0.000096,0.000043,-0.000155,0.000272,-0.000129,-0.000063,-0.000300,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298


In [219]:
df_cat_final=pd.concat([genre_df,directors_final,stars_final,oneline_df_final],axis=1)

In [220]:
df_cat_final

Unnamed: 0,GENRE_,GENRE_Action,GENRE_Adventure,GENRE_Animation,GENRE_Biography,GENRE_Comedy,GENRE_Crime,GENRE_Documentary,GENRE_Drama,GENRE_Family,...,40,41,42,43,44,45,46,47,48,49
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.009668,-0.004068,-0.010394,0.003189,0.011380,0.057383,0.012156,0.080962,-0.009445,-0.032020
1,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.081211,0.100061,0.013263,-0.071322,0.012164,-0.008830,-0.025017,-0.042778,0.031455,-0.020281
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.023215,-0.016455,-0.038824,0.068950,0.044964,-0.016650,0.046170,0.012192,0.052725,0.037394
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.024922,0.066593,0.017825,-0.071090,-0.024666,-0.051029,-0.015186,-0.027923,0.077967,-0.026163
4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.009447,0.014709,-0.039494,0.035182,0.035217,0.011850,0.027109,0.044300,0.043675,0.026046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9540,,,,,,,,,,,...,-0.020502,0.002191,0.056205,-0.005734,-0.035848,0.005360,-0.010647,-0.001396,-0.006479,-0.033228
9542,,,,,,,,,,,...,0.009595,0.050737,-0.027713,0.020457,-0.011576,-0.026457,0.044554,0.002011,-0.022842,-0.020609
9565,,,,,,,,,,,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298
9566,,,,,,,,,,,...,-0.000540,-0.001212,-0.001494,-0.000920,-0.001577,0.000337,0.000658,-0.000525,-0.000385,-0.000298


In [221]:
df_all=pd.concat([df_num.reset_index(drop=True),df_cat_final],axis=1)
df_all

Unnamed: 0,RATING,VOTES,RunTime,GENRE_,GENRE_Action,GENRE_Adventure,GENRE_Animation,GENRE_Biography,GENRE_Comedy,GENRE_Crime,...,40,41,42,43,44,45,46,47,48,49
0,6.1,193.0,120.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.009668,-0.004068,-0.010394,0.003189,0.011380,0.057383,0.012156,0.080962,-0.009445,-0.032020
1,5.0,193.0,25.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,-0.081211,0.100061,0.013263,-0.071322,0.012164,-0.008830,-0.025017,-0.042778,0.031455,-0.020281
2,8.2,193.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.023215,-0.016455,-0.038824,0.068950,0.044964,-0.016650,0.046170,0.012192,0.052725,0.037394
3,8.6,193.0,23.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,-0.024922,0.066593,0.017825,-0.071090,-0.024666,-0.051029,-0.015186,-0.027923,0.077967,-0.026163
4,7.1,193.0,60.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.009447,0.014709,-0.039494,0.035182,0.035217,0.011850,0.027109,0.044300,0.043675,0.026046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
9995,,,,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,,,,,,,,,,
9996,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
9997,,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [222]:
df_all.fillna(value=0,inplace=True)

### Feature Transformation

Feature transformation is the process of converting existing features in a dataset into a new form to improve the performance, interpretability, and efficiency of a machine learning model.

ex:
1. log transformation(widely used)

2. root transformation

3. square transformation

### 9. Feature Scaling (Final step)

In [224]:
from sklearn.preprocessing import MinMaxScaler
mx=MinMaxScaler()

scaled=mx.fit_transform(df_all)

df_final=pd.DataFrame(scaled)

In [225]:
df_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,171,172,173,174,175,176,177,178,179,180
0,0.709302,0.266207,1.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.373338,0.390567,0.340548,0.434701,0.317507,0.554906,0.523173,0.706661,0.380693,0.346968
1,0.581395,0.266207,0.208333,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.181915,0.584773,0.378973,0.251863,0.318882,0.383991,0.436957,0.305254,0.490157,0.387699
2,0.953488,0.266207,0.366667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.337092,0.367465,0.294371,0.596068,0.376369,0.363805,0.602063,0.483575,0.547085,0.587807
3,1.000000,0.266207,0.191667,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.332524,0.522353,0.386384,0.252432,0.254329,0.275060,0.459759,0.353443,0.614642,0.367289
4,0.825581,0.266207,0.500000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.373928,0.425587,0.293283,0.513206,0.359286,0.437372,0.557854,0.587731,0.522863,0.548434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9862,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.399206,0.398154,0.357431,0.426876,0.297561,0.406783,0.494980,0.444025,0.405972,0.458065
9863,0.000000,0.000000,0.000000,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.399206,0.398154,0.357431,0.426876,0.297561,0.406783,0.494980,0.444025,0.405972,0.458065
9864,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.399206,0.398154,0.357431,0.426876,0.297561,0.406783,0.494980,0.444025,0.405972,0.458065
9865,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.399206,0.398154,0.357431,0.426876,0.297561,0.406783,0.494980,0.444025,0.405972,0.458065


# ----------------------The End-------------------------