In [19]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from IPython.display import display, Markdown
import re

In [20]:
train_data_df         = pd.read_csv('/home/lenny/Python-Notebooks/Movie-Genre-Classification/train_data.txt', sep=" ::: ", engine='python'
  , header=None, names=["ID", "Title", "Genre", "Description"])
test_data_df          = pd.read_csv('/home/lenny/Python-Notebooks/Movie-Genre-Classification/test_data.txt', sep=" ::: ", engine='python'
  , header=None, names=["ID", "Title", "Description"])
test_data_solution_df = pd.read_csv('/home/lenny/Python-Notebooks/Movie-Genre-Classification/test_data_solution.txt', sep=" ::: ", engine='python'
  , header=None, names=["ID", "Genre", "Description"])

def Explore(title_, dataframe_):
    display(Markdown(title_))
    display(Markdown('---'))
    display(dataframe_.head())
    display('Descriptive Statistics')
    display(dataframe_.describe())
    display('Missing Values')
    display(dataframe_.isnull().sum().sort_values(ascending=False)) 
    display('Column Data Types')
    display(dataframe_.dtypes) 
  
Explore('### Train Data', train_data_df) 
Explore('### Test Data', test_data_df)   
Explore('### Test Data Solutions', test_data_solution_df)  

### Train Data

---

Unnamed: 0,ID,Title,Genre,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


'Descriptive Statistics'

Unnamed: 0,ID
count,54214.0
mean,27107.5
std,15650.378084
min,1.0
25%,13554.25
50%,27107.5
75%,40660.75
max,54214.0


'Missing Values'

ID             0
Title          0
Genre          0
Description    0
dtype: int64

'Column Data Types'

ID              int64
Title          object
Genre          object
Description    object
dtype: object

### Test Data

---

Unnamed: 0,ID,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),Before he was known internationally as a marti...


'Descriptive Statistics'

Unnamed: 0,ID
count,54200.0
mean,27100.5
std,15646.336632
min,1.0
25%,13550.75
50%,27100.5
75%,40650.25
max,54200.0


'Missing Values'

ID             0
Title          0
Description    0
dtype: int64

'Column Data Types'

ID              int64
Title          object
Description    object
dtype: object

### Test Data Solutions

---

Unnamed: 0,ID,Genre,Description
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apart..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty chi..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family o...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with his..."
5,Er nu zhai (1955),drama,Before he was known internationally as a marti...


'Descriptive Statistics'

Unnamed: 0,ID,Genre,Description
count,54200,54200,54200
unique,54200,27,54072
top,Curitiba Zero Grau (2010),drama,Grammy - music award of the American academy o...
freq,1,13612,10


'Missing Values'

ID             0
Genre          0
Description    0
dtype: int64

'Column Data Types'

ID             object
Genre          object
Description    object
dtype: object

--- 
# Data Preprocessing

In [24]:
# Convert training data samples to lowercase.
# 'apply' applies the function to each column rather than individual cells.
train_data_df[['Title', 'Genre', 'Description']] = train_data_df[['Title', 'Genre', 'Description']].apply(lambda x: x.str.lower())

# Remove character that aren't letter a-zA-Z, white spaces in the case of genre and description and for title we also preserve numbers to account for release dates which could be valuable information.
# 'map' applies the function to individual cells.
train_data_df[['Genre', 'Description']] = train_data_df[['Genre', 'Description']].map(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
train_data_df[['Title']] = train_data_df[['Title']].map(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
train_data_df.head()

Unnamed: 0,ID,Title,Genre,Description
0,1,oscar et la dame rose 2009,drama,listening in to a conversation between his doc...
1,2,cupid 1997,thriller,a brother and sister with a past incestuous re...
2,3,young wild and wonderful 1980,adult,as the bus empties the students for their fiel...
3,4,the secret sin 1915,drama,to help their unemployed father make ends meet...
4,5,the unrecovered 2007,drama,the films title refers not only to the unrecov...
