In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('../data/raw/imdb_top_1000.csv')

In [3]:
df.head()
df.shape
df.info()
df.isnull().sum()
df.duplicated().any()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   str    
 1   Series_Title   1000 non-null   str    
 2   Released_Year  1000 non-null   str    
 3   Certificate    899 non-null    str    
 4   Runtime        1000 non-null   str    
 5   Genre          1000 non-null   str    
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   str    
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   str    
 10  Star1          1000 non-null   str    
 11  Star2          1000 non-null   str    
 12  Star3          1000 non-null   str    
 13  Star4          1000 non-null   str    
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    str    
dtypes: float64(2), int64(1), str(13)
memory usage: 125.1 KB


np.False_

In [4]:
df = df.drop(columns=[
    "Poster_Link",
    "Overview",
    "Certificate",
    "Star2",
    "Star3",
    "Star4"
])

## Initial Data Inspection 

### Dataest Shape
the dataset contains **1000 rows** and **10 colunms**.

### Missing Values
- `Meta_score` has missing values
- `Gross` has missing values

### Data Type Issues
- `Released_Year` should be numeric but contains text values
- `Gross` should be numeric but contains non-numeric values

### Low vote Movies
Some movies have very low number of votes and may not be reliable for analysis.

### Summary
The dataset requires data cleaning including:
- Handling missing values
- Fixing data types
- filterinf low-vote movies

In [5]:
df.columns=df.columns.str.lower().str.strip()

In [6]:
df['released_year']=pd.to_numeric(df['released_year'],errors='coerce')
df['released_year']=df['released_year'].fillna(0).astype(int)

In [7]:
df['runtime']=df['runtime'].str.replace(' min','')
df['runtime']=pd.to_numeric(df['runtime'],errors='coerce')

In [8]:
df['gross']=df['gross'].str.replace(',','')
df['gross']=pd.to_numeric(df['gross'],errors='coerce')

In [9]:
df['genre']=df['genre'].str.lower().str.strip()

In [10]:
df=df.dropna(subset=['released_year','imdb_rating','genre'])

In [11]:
df['meta_score']=df['meta_score'].fillna(df['meta_score'].median())
df['gross']=df['gross'].fillna(df['gross'].median())

In [12]:
df.isnull().sum()
df.info()
df.describe()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   series_title   1000 non-null   str    
 1   released_year  1000 non-null   int64  
 2   runtime        1000 non-null   int64  
 3   genre          1000 non-null   str    
 4   imdb_rating    1000 non-null   float64
 5   meta_score     1000 non-null   float64
 6   director       1000 non-null   str    
 7   star1          1000 non-null   str    
 8   no_of_votes    1000 non-null   int64  
 9   gross          1000 non-null   float64
dtypes: float64(3), int64(3), str(4)
memory usage: 78.3 KB


Unnamed: 0,released_year,runtime,imdb_rating,meta_score,no_of_votes,gross
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1989.226,122.891,7.9493,78.133,273692.9,60513600.0
std,67.135341,28.093671,0.275491,11.368225,327372.7,101419200.0
min,0.0,45.0,7.6,28.0,25088.0,1305.0
25%,1976.0,103.0,7.7,72.0,55526.25,5012919.0
50%,1999.0,119.0,7.9,79.0,138548.5,23530890.0
75%,2009.0,137.0,8.1,85.25,374161.2,61539890.0
max,2020.0,321.0,9.3,100.0,2343110.0,936662200.0


In [13]:
df['decade']=(df['released_year']//10)*10

df['released_period']=pd.cut(
    df['released_year'],
    bins=[1900,1980,2000,2026],
    labels=['old','middle','new']
)

In [14]:
df['rating_category']=pd.cut(
    df['imdb_rating'],
    bins=[0,5,7,8.5,10],
    labels=['low','average','high','excellent']
)

df['gross category']=pd.cut(
    df['gross'],
    bins=[0,50000000,200000000,1000000000],
    labels=['low','medium','high']
)

df['vote_pre_millon_gross']=df['no_of_votes'] / (df['gross']/1_000_1000)

In [15]:
df['main_genre']=df['genre'].str.split(',').str[0]
df['genre_count']=df['genre'].str.count(',')+1

In [16]:
df

Unnamed: 0,series_title,released_year,runtime,genre,imdb_rating,meta_score,director,star1,no_of_votes,gross,decade,released_period,rating_category,gross category,vote_pre_millon_gross,main_genre,genre_count
0,The Shawshank Redemption,1994,142,drama,9.3,80.0,Frank Darabont,Tim Robbins,2343110,28341469.0,1990,middle,excellent,low,8.268253e+05,drama,1
1,The Godfather,1972,175,"crime, drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,1620367,134966411.0,1970,old,excellent,medium,1.200691e+05,crime,2
2,The Dark Knight,2008,152,"action, crime, drama",9.0,84.0,Christopher Nolan,Christian Bale,2303232,534858444.0,2000,new,excellent,high,4.306677e+04,action,3
3,The Godfather: Part II,1974,202,"crime, drama",9.0,90.0,Francis Ford Coppola,Al Pacino,1129952,57300000.0,1970,old,excellent,medium,1.972190e+05,crime,2
4,12 Angry Men,1957,96,"crime, drama",9.0,96.0,Sidney Lumet,Henry Fonda,689845,4360000.0,1950,old,excellent,low,1.582372e+06,crime,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,"comedy, drama, romance",7.6,76.0,Blake Edwards,Audrey Hepburn,166544,23530892.0,1960,old,high,low,7.078383e+04,comedy,3
996,Giant,1956,201,"drama, western",7.6,84.0,George Stevens,Elizabeth Taylor,34075,23530892.0,1950,old,high,low,1.448241e+04,drama,2
997,From Here to Eternity,1953,118,"drama, romance, war",7.6,85.0,Fred Zinnemann,Burt Lancaster,43374,30500000.0,1950,old,high,low,1.422241e+04,drama,3
998,Lifeboat,1944,97,"drama, war",7.6,78.0,Alfred Hitchcock,Tallulah Bankhead,26471,23530892.0,1940,old,high,low,1.125059e+04,drama,2


In [17]:
df.shape

(1000, 17)

In [18]:
df.to_csv("../data/processed/imdb_clean.csv", index=False)