In [62]:
from pathlib import Path

import numpy as np
import pandas as pd

In [63]:
# Load the raw CSV; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='../data/raw/imdb_top_1000.csv')

In [64]:
# A cell renders only its final bare expression, so every intermediate summary
# is passed to display() explicitly; otherwise the preview, the shape and the
# per-column null counts are computed and silently thrown away.
display(df.head())           # first rows
display(df.shape)            # (rows, columns)
df.info()                    # prints dtypes / non-null counts itself
display(df.isnull().sum())   # missing values per column
df.duplicated().any()        # True if any fully duplicated row exists

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   str    
 1   Series_Title   1000 non-null   str    
 2   Released_Year  1000 non-null   str    
 3   Certificate    899 non-null    str    
 4   Runtime        1000 non-null   str    
 5   Genre          1000 non-null   str    
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   str    
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   str    
 10  Star1          1000 non-null   str    
 11  Star2          1000 non-null   str    
 12  Star3          1000 non-null   str    
 13  Star4          1000 non-null   str    
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    str    
dtypes: float64(2), int64(1), str(13)
memory usage: 125.1 KB


np.False_

In [65]:
# Remove the listed columns; drop() returns a new frame that is bound back to `df`.
df = df.drop(
    ["Poster_Link", "Overview", "Certificate", "Star2", "Star3", "Star4"],
    axis="columns",
)

## Initial Data Inspection 

### Dataset Shape
The dataset contains **1000 rows** and **10 columns** (the original 16 columns minus the 6 dropped above).

### Missing Values
- `Meta_score` has missing values
- `Gross` has missing values

### Data Type Issues
- `Released_Year` should be numeric but contains text values
- `Gross` should be numeric but contains non-numeric values

### Low-Vote Movies
Some movies have a very low number of votes and may not be reliable for analysis.

### Summary
The dataset requires data cleaning including:
- Handling missing values
- Fixing data types
- Filtering low-vote movies

In [66]:
# Lower-case every column label and trim surrounding whitespace.
df = df.rename(columns=lambda label: label.lower().strip())

In [67]:
# Parse release years as numbers (unparseable text becomes NaN), discard the
# rows that failed to parse, then store the column as integers.
# The original row index is kept, so dropped rows leave gaps in it.
df = (
    df.assign(released_year=pd.to_numeric(df['released_year'], errors='coerce'))
      .dropna(subset=['released_year'])
      .astype({'released_year': int})
)

In [68]:
# Remove the literal " min" unit text, then convert to numbers;
# anything that still is not numeric becomes NaN.
df['runtime'] = pd.to_numeric(
    df['runtime'].str.replace(' min', '', regex=False),
    errors='coerce',
)

In [69]:
# Strip thousands separators, then convert to numbers;
# anything that still is not numeric becomes NaN.
df['gross'] = pd.to_numeric(
    df['gross'].str.replace(',', '', regex=False),
    errors='coerce',
)

In [70]:
# Lower-case the genre text and trim whitespace at both ends of the whole string
# (spaces after the commas inside the string are left as they are).
df = df.assign(genre=df['genre'].str.lower().str.strip())

In [71]:
# Fill gaps in each column with that column's own median.
# NOTE(review): imputed rows become indistinguishable from observed ones —
# confirm this is acceptable for any analysis that relies on gross or meta_score.
df = df.fillna({
    'meta_score': df['meta_score'].median(),
    'gross': df['gross'].median(),
})

In [72]:
# Verify the cleaning result. Only the last bare expression of a cell is
# rendered, so the null counts are shown via display() instead of being discarded.
display(df.isnull().sum())   # remaining missing values per column
df.info()                    # prints dtypes / non-null counts itself
df.describe()                # numeric summary, rendered as the cell output

<class 'pandas.DataFrame'>
Index: 999 entries, 0 to 999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   series_title   999 non-null    str    
 1   released_year  999 non-null    int64  
 2   runtime        999 non-null    int64  
 3   genre          999 non-null    str    
 4   imdb_rating    999 non-null    float64
 5   meta_score     999 non-null    float64
 6   director       999 non-null    str    
 7   star1          999 non-null    str    
 8   no_of_votes    999 non-null    int64  
 9   gross          999 non-null    float64
dtypes: float64(3), int64(3), str(4)
memory usage: 85.9 KB


Unnamed: 0,released_year,runtime,imdb_rating,meta_score,no_of_votes,gross
count,999.0,999.0,999.0,999.0,999.0,999.0
mean,1991.217217,122.873874,7.94965,78.134134,273697.4,60387740.0
std,23.297025,28.10252,0.275407,11.373863,327536.6,101411000.0
min,1920.0,45.0,7.6,28.0,25088.0,1305.0
25%,1976.0,103.0,7.7,72.0,55471.5,5011838.0
50%,1999.0,119.0,7.9,79.0,138356.0,23457440.0
75%,2009.0,137.0,8.1,85.5,374477.5,61390040.0
max,2020.0,321.0,9.3,100.0,2343110.0,936662200.0


In [73]:
# Decade bucket: floor the year to a multiple of ten (e.g. 1994 -> 1990).
df['decade']=(df['released_year']//10)*10

# Era bucket. pd.cut intervals are right-closed by default:
#   (1900, 1980] -> 'old', (1980, 2000] -> 'middle', (2000, inf] -> 'new'.
# The top edge is open-ended instead of a fixed calendar year, so releases
# later than any hard-coded cutoff are still labelled 'new' rather than NaN.
# Years of 1900 or earlier fall outside every bin and become NaN.
df['released_period']=pd.cut(
    df['released_year'],
    bins=[1900,1980,2000,np.inf],
    labels=['old','middle','new']
)

In [81]:
# Rating tier. Intervals are right-closed: (0, 5] low, (5, 7] average,
# (7, 8.5] high, (8.5, 10] excellent. The recorded describe() output shows a
# minimum rating of 7.6, so only 'high' and 'excellent' occur for that data.
df['rating_category']=pd.cut(
    df['imdb_rating'],
    bins=[0,5,7,8.5,10],
    labels=['low','average','high','excellent']
)

# Gross tier: (0, 50M] low, (50M, 200M] medium, above 200M high.
# The top edge is open-ended so a gross above any fixed cap is still 'high'
# instead of NaN. A gross of exactly 0 falls outside every bin (NaN).
# NOTE(review): missing gross values were median-filled in an earlier cell, so
# for those rows this tier and the ratio below describe the median, not the film.
df['gross_category']=pd.cut(
    df['gross'],
    bins=[0,50_000_000,200_000_000,np.inf],
    labels=['low','medium','high']
)

# Votes per one million of gross. Division by a zero gross yields +/-inf,
# which is converted to NaN so it cannot distort later aggregates.
df['vote_per_million_gross'] = df['no_of_votes'] / (df['gross'] / 1_000_000)
df['vote_per_million_gross'] = df['vote_per_million_gross'].replace([np.inf, -np.inf], np.nan)

In [75]:
# Derive two genre features from the comma-separated genre string:
#   main_genre  - the text before the first comma
#   genre_count - number of listed genres (comma count plus one)
df = df.assign(
    main_genre=df['genre'].str.split(',').str[0],
    genre_count=df['genre'].str.count(',') + 1,
)

In [77]:
# Final dimensions (rows, columns) of the frame after cleaning and feature creation.
df.shape

(999, 17)

In [78]:
# Persist the cleaned frame without the row index.
# The parent directory is created first: to_csv does not create missing
# directories and raises OSError when the target folder does not exist.
output_path = Path("../data/processed/imdb_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)