### load Data	

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import re
import time
import datetime
from collections import defaultdict
import ast

In [None]:
# Read the training and test CSV files into two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Show the (rows, columns) shape of the training and test frames.
train_data.shape,test_data.shape

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Count the missing values in each column.
train_data.isnull().sum()

In [None]:
# Percentage of missing values per column, stored as a one-column DataFrame.
per_train_data = (
    train_data.isnull()
    .sum()
    .div(len(train_data))
    .mul(100)
    .to_frame()
)

In [None]:
# Display the per-column missing-value percentages.
per_train_data

In [None]:
# Inspect imdb_id (one of the columns dropped in the data-filter step).
train_data['imdb_id']

In [None]:
# Inspect the original_language column.
train_data['original_language']

In [None]:
 # Inspect poster_path (one of the columns dropped in the data-filter step).
 train_data['poster_path']

In [None]:
# Inspect overview (one of the columns dropped in the data-filter step).
train_data['overview']

In [None]:
# Inspect the production_companies column.
train_data['production_companies']

In [None]:
 # Inspect the production_countries column.
 train_data['production_countries']

In [None]:
 # Inspect tagline (one of the columns dropped in the data-filter step).
 train_data['tagline']

In [None]:
 # Inspect the title column.
 train_data['title']

### data filter

In [None]:
# Remove these nine columns from train_data in place.
train_data.drop(
    columns=[
        'belongs_to_collection', 'homepage',
        'imdb_id', 'poster_path', 'tagline', 'overview',
        'original_title', 'Keywords', 'crew',
    ],
    inplace=True,
)

In [None]:
# Show the (rows, columns) shape of the training frame at this point.
train_data.shape

In [None]:
# Print the dtypes and non-null counts of the columns currently in the frame.
train_data.info()

In [None]:
# Display summary statistics for the training frame.
train_data.describe()

### data processing

### budget vs revenue

In [None]:
# Ten rows with the highest revenue, with the revenue column shaded by value.
(
    train_data[['id', 'title', 'budget', 'revenue']]
    .sort_values(by='revenue', ascending=False)
    .head(10)
    .style
    .background_gradient(subset='revenue', cmap='BuGn')
)

In [None]:
# Scatter plot of budget (x) against revenue (y), one point per row.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
ax.scatter(
    x=train_data['budget'],
    y=train_data['revenue'],
    alpha=0.5,
)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')

  At first glance, budget seems to have a relatively high positive correlation with revenue.

### genres VS revenue	

In [None]:
# Look at the raw genres value stored under index label 0.
train_data['genres'][0]

In [None]:
# Build a separate one-column frame of genres, with missing entries replaced
# by the string "None", and preview its first five values.
genres = train_data[["genres"]].fillna("None")
genres["genres"].head()

In [None]:
def extract_genres(row):
    """Return the genre names contained in one raw ``genres`` cell.

    Args:
        row: Either the placeholder string ``"None"`` or the text form of a
            list of dicts such as ``"[{'id': 35, 'name': 'Comedy'}]"``.
            Non-string values (for example NaN) are treated like the
            placeholder.

    Returns:
        ``['None']`` for the placeholder or a non-string value; otherwise the
        list of ``name`` values in the order they appear in ``row``.
    """
    if not isinstance(row, str) or row == "None":
        return ['None']

    # Parse the cell as a Python literal so that names containing hyphens,
    # apostrophes, single characters or several words are all kept.
    try:
        parsed = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [
            entry['name']
            for entry in parsed
            if isinstance(entry, dict) and 'name' in entry
        ]

    # The text is not a well-formed list literal: fall back to pattern
    # matching, accepting any characters up to the closing quote.
    return re.findall(r"'name': '([^']+)'", row)
    
# Replace every raw genres string with its list of genre names.
genres["genres"] = genres["genres"].map(extract_genres)

In [None]:
# Tally how many rows list each genre name.
genres_dict = {}
for genre in genres["genres"]:
    for elem in genre:
        genres_dict[elem] = genres_dict.get(elem, 0) + 1

# One row per genre name, ordered from the most to the least frequent.
genres_df = pd.DataFrame.from_dict(
    genres_dict,
    orient='index',
    columns=['Movie Numbers'],
).sort_values(by='Movie Numbers', ascending=False)

In [None]:
# Bar chart of the per-genre counts.
genres_df.plot(kind='bar')

### popularity VS revenue

In [None]:
# Scatter plot of popularity (x) against revenue (y), one point per row.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(train_data['popularity'], train_data['revenue'], alpha=0.5)
# The x values are popularity scores, so the axis is labelled accordingly.
ax.set_xlabel('Popularity')
ax.set_ylabel('Revenue')

In [None]:
# Histogram of the popularity values (bar counts only, no density curve).
plt.figure(figsize=(12, 8))
sns.histplot(train_data['popularity'], kde=False)
plt.title("Movie Popularity Count", fontsize=20)
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.xticks(fontsize=12)
plt.show()

### release_date vs revenue

In [None]:
# Inspect the raw release_date column before it is split into parts.
train_data['release_date']

In [None]:
# Extract release_date
# The text is split on '/' into month, day and year, in that order; missing
# pieces are replaced with -1 so the columns can be cast to int afterwards.
train_data[['release_month', 'release_day', 'release_year']] = (
    train_data['release_date']
    .str.split('/', expand=True)
    .replace(np.nan, -1)
)

In [None]:
# The split pieces are strings (or the -1 placeholder); make them integers.
train_data['release_month'] = train_data['release_month'].astype(int)
train_data['release_day'] = train_data['release_day'].astype(int)
train_data['release_year'] = train_data['release_year'].astype(int)

# Expand two-digit years: 0-19 become 2000-2019 and 20-99 become 1920-1999.
# The lower bound of 0 keeps the -1 placeholder from being turned into 1999.
train_data.loc[train_data['release_year'].between(0, 19), "release_year"] += 2000
train_data.loc[train_data['release_year'].between(20, 99), "release_year"] += 1900

# Day of the week (Monday=0 ... Sunday=6) rebuilt from the corrected date
# parts; rows whose parts do not form a valid date are coerced to NaT and
# therefore end up as NaN.
train_data['Day_of_Week'] = pd.to_datetime(
    train_data[['release_year', 'release_month', 'release_day']].rename(
        columns={
            'release_year': 'year',
            'release_month': 'month',
            'release_day': 'day',
        }
    ),
    errors='coerce',
).dt.dayofweek

In [None]:
# Month numbers shifted down by one, in the order value_counts() lists them.
indices = [
    month - 1
    for month in train_data['release_month'].value_counts().index
]
indices

In [None]:
# Number of rows per release month, ordered by month number.
train_data.release_month.value_counts().sort_index()

In [None]:
# Display the training frame, including the newly added date columns.
train_data

### Year

In [None]:
# Number of rows per release year, one bar per year.
plt.figure(figsize=(20, 12))
# The series is passed as `x` by keyword: depending on the seaborn version a
# positional argument is either deprecated or interpreted as `data`.
sns.countplot(x=train_data['release_year'].sort_values(), edgecolor=(0, 0, 0))
plt.title("Year", fontsize=20)
plt.xlabel('Year')
plt.ylabel('Number of Movies Release')
plt.xticks(fontsize=12, rotation=90)
plt.show()

In [None]:
# Scatter plot of revenue against release year for the first 3000 rows.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(train_data['release_year'][:3000], train_data['revenue'][:3000])
ax.set_xlabel("Year of release")
# Each point is a single row, so the axis shows raw revenue, not an average.
ax.set_ylabel("Revenue")
plt.show()

In [None]:
# Line plot of the mean revenue of the rows in each release year.
year = train_data['release_year']
revenue = train_data.groupby('release_year')["revenue"].mean()

fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(revenue.index, revenue)
ax.set_xlabel("Year of release")
ax.set_ylabel("Average revenue")
plt.show()

### Month

In [None]:
# Number of rows per release month, one bar per month.
plt.figure(figsize=(20, 12))
# The series is passed as `x` by keyword: depending on the seaborn version a
# positional argument is either deprecated or interpreted as `data`.
sns.countplot(x=train_data['release_month'].sort_values(), edgecolor=(0, 0, 0))
plt.title("Month", fontsize=20)
plt.xlabel('Month')
plt.ylabel('Number of Movies Release')
plt.xticks(fontsize=15)
plt.show()

In [None]:
# Semi-transparent bars of revenue at each release month (first 3000 rows).
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(
    train_data['release_month'][:3000],
    train_data['revenue'][:3000],
    alpha=0.5,
)
ax.set_title("Month", fontsize=20)
ax.set_xlabel('Month')
ax.set_ylabel('revenue')
plt.xticks(fontsize=15)

### Week


In [None]:
# Display the three date-part columns side by side.
train_data[['release_month','release_day','release_year']]

### runtime

In [None]:
# Histogram of runtime divided by 60; missing runtimes are counted as 0.
plt.hist(train_data['runtime'].fillna(0).div(60), bins=40)
plt.title('Distribution of length of film in hours', fontsize=12)
plt.xlabel('Duration of Movie in Hours')
plt.ylabel('Number of Movies')

In [None]:
# Ten rows with the longest runtime; runtime, budget and revenue are shaded
# by value.
(
    train_data[['id', 'title', 'runtime', 'budget', 'revenue']]
    .sort_values(by='runtime', ascending=False)
    .head(10)
    .style
    .background_gradient(subset=['runtime', 'budget', 'revenue'], cmap='YlGn')
)