In [75]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, RobustScaler

In [76]:
group_path = "group-marks.csv"
netflix_path = "netflix_titles_nov_2019.csv"

df_group = pd.read_csv(group_path)
df_netflix = pd.read_csv(netflix_path)

In [77]:
def explore(df, name, show_top=3, show_tail=2):
    print(f"=== Explore: {name} ===")
    print("Shape:", df.shape)
    display(df.head(show_top))
    display(df.tail(show_tail))
    print("\ninfo():")
    df.info()
    print("\nNumeric describe():")
    display(df.describe(include=[np.number]).round(4))
    print("\nAll describe():")
    display(df.describe(include='all').transpose().head(20))
    print("\nMissing values:")
    display(df.isnull().sum())
    print("\n---\n")

In [78]:
explore(df_group, "Group Marks (original)")
explore(df_netflix, "Netflix Titles (original)")

=== Explore: Group Marks (original) ===
Shape: (50, 10)


Unnamed: 0,rollno,name,gender,group,session,age,scholarship,math,english,urdu
0,MS01,SAADIA,female,group B,MORNING,28,2562,No Idea,72.0,74
1,MS02,JUMAIMA,female,group C,AFTERNOON,33,2800,69,90.0,88
2,MS03,ARIFA,female,,EVENING,34,3500,,95.0,93


Unnamed: 0,rollno,name,gender,group,session,age,scholarship,math,english,urdu
48,MS49,FATIMA,female,group D,MOR,40,2500,57,74.0,76
49,MS50,KAKAMANNA,male,group C,AFTERNOON,37,3000,66,78.0,81



info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rollno       50 non-null     object 
 1   name         50 non-null     object 
 2   gender       50 non-null     object 
 3   group        47 non-null     object 
 4   session      50 non-null     object 
 5   age          50 non-null     int64  
 6   scholarship  50 non-null     int64  
 7   math         46 non-null     object 
 8   english      47 non-null     float64
 9   urdu         50 non-null     int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 4.0+ KB

Numeric describe():


Unnamed: 0,age,scholarship,english,urdu
count,50.0,50.0,47.0,50.0
mean,34.36,2875.54,68.7234,65.28
std,7.7321,513.4054,13.5658,14.8792
min,19.0,2000.0,42.0,28.0
25%,29.0,2500.0,57.5,55.0
50%,33.0,3000.0,71.0,65.5
75%,38.75,3453.0,76.5,75.75
max,54.0,4000.0,95.0,93.0



All describe():


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
rollno,50.0,50.0,MS01,1.0,,,,,,,
name,50.0,44.0,FATIMA,3.0,,,,,,,
gender,50.0,2.0,female,28.0,,,,,,,
group,47.0,5.0,group C,14.0,,,,,,,
session,50.0,6.0,EVENING,10.0,,,,,,,
age,50.0,,,,34.36,7.732149,19.0,29.0,33.0,38.75,54.0
scholarship,50.0,,,,2875.54,513.405445,2000.0,2500.0,3000.0,3453.0,4000.0
math,46.0,31.0,69,4.0,,,,,,,
english,47.0,,,,68.723404,13.565785,42.0,57.5,71.0,76.5,95.0
urdu,50.0,,,,65.28,14.87916,28.0,55.0,65.5,75.75,93.0



Missing values:


rollno         0
name           0
gender         0
group          3
session        0
age            0
scholarship    0
math           4
english        3
urdu           0
dtype: int64


---

=== Explore: Netflix Titles (original) ===
Shape: (5837, 12)


Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
0,81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,"November 30, 2019",2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...,TV Show
1,81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,,"November 30, 2019",2019,TV-G,67 min,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor...",Movie
2,81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...",India,"November 30, 2019",2019,TV-14,135 min,"Comedies, Dramas, International Movies",A goofy copywriter unwittingly convinces the I...,Movie


Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
5835,70157452,Dinner for Five,,,United States,"February 4, 2008",2007,TV-MA,1 Season,Stand-Up Comedy & Talk Shows,"In each episode, four celebrities join host Jo...",TV Show
5836,70053412,To and From New York,Sorin Dan Mihalcescu,"Barbara King, Shaana Diya, John Krisiukenas, Y...",United States,"January 1, 2008",2006,NR,81 min,"Dramas, Independent Movies, Thrillers","While covering a story in New York City, a Sea...",Movie



info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5837 non-null   int64 
 1   title         5837 non-null   object
 2   director      3936 non-null   object
 3   cast          5281 non-null   object
 4   country       5410 non-null   object
 5   date_added    5195 non-null   object
 6   release_year  5837 non-null   int64 
 7   rating        5827 non-null   object
 8   duration      5837 non-null   object
 9   listed_in     5837 non-null   object
 10  description   5837 non-null   object
 11  type          5837 non-null   object
dtypes: int64(2), object(10)
memory usage: 547.3+ KB

Numeric describe():


Unnamed: 0,show_id,release_year
count,5837.0,5837.0
mean,77300790.0,2013.6885
std,9479777.0,8.4191
min,269880.0,1925.0
25%,80045200.0,2013.0
50%,80163530.0,2016.0
75%,80241880.0,2018.0
max,81227200.0,2020.0



All describe():


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
show_id,5837.0,,,,77300789.821312,9479777.46861,269880.0,80045203.0,80163533.0,80241876.0,81227195.0
title,5837.0,5780.0,Tunnel,3.0,,,,,,,
director,3936.0,3108.0,"Raúl Campos, Jan Suter",18.0,,,,,,,
cast,5281.0,5087.0,David Attenborough,18.0,,,,,,,
country,5410.0,527.0,United States,1907.0,,,,,,,
date_added,5195.0,1092.0,"November 1, 2019",94.0,,,,,,,
release_year,5837.0,,,,2013.688539,8.419088,1925.0,2013.0,2016.0,2018.0,2020.0
rating,5827.0,14.0,TV-MA,1937.0,,,,,,,
duration,5837.0,194.0,1 Season,1259.0,,,,,,,
listed_in,5837.0,449.0,Documentaries,297.0,,,,,,,



Missing values:


show_id            0
title              0
director        1901
cast             556
country          427
date_added       642
release_year       0
rating            10
duration           0
listed_in          0
description        0
type               0
dtype: int64


---



In [79]:
# Part A: Group Marks
g = df_group.copy()

In [85]:
# 1) Remove exact duplicate rows
rows_before = g.shape[0]
g = g.drop_duplicates()
rows_after = g.shape[0]
print(f"Group: dropped {rows_before - rows_after} duplicate rows")

Group: dropped 0 duplicate rows


In [87]:
# 2) Normalize column names (strip, lowercase, underscores)
g.columns = [c.strip().lower().replace(' ', '_') for c in g.columns]

In [89]:
identifier_keywords = ['name','student','id','roll','reg']
identifier_cols = [c for c in g.columns if any(k in c for k in identifier_keywords)]
candidate_mark_cols = [c for c in g.columns if c not in identifier_cols]

print("Identifier columns detected:", identifier_cols)
print("Numeric candidate columns:", candidate_mark_cols)

Identifier columns detected: ['rollno', 'name']
Numeric candidate columns: ['gender', 'group', 'session', 'age', 'scholarship', 'math', 'english', 'urdu']


In [91]:
# 4) Convert candidate numeric columns to numeric 
for c in candidate_mark_cols:
    g[c] = pd.to_numeric(g[c], errors='coerce')

In [93]:
# 5) Handle missing numeric values: fill with median
numeric_cols = g.select_dtypes(include=[np.number]).columns.tolist()
for c in numeric_cols:
    med = g[c].median()
    g[c] = g[c].fillna(med)

In [95]:
# 6) Handle missing categorical: fill with mode or 'Unknown'
cat_cols = g.select_dtypes(include=['object']).columns.tolist()
for c in cat_cols:
    if g[c].isnull().any():
        mode = g[c].mode()
        fill = mode.iloc[0] if not mode.empty else 'Unknown'
        g[c] = g[c].fillna(fill)

In [97]:
# 7) Remove duplicate student rows by identifier if meaningful 
# If an identifier column exists (like roll or id), keep first occurrence:
if identifier_cols:
    primary_id = identifier_cols[0]
    before = g.shape[0]
    g = g.drop_duplicates(subset=[primary_id])
    after = g.shape[0]
    print(f"Group: dropped {before-after} duplicates by {primary_id}")

Group: dropped 0 duplicates by rollno


In [99]:
# 8) If there are multiple numeric columns that look like subject marks, create totals/averages
marks_cols = [c for c in numeric_cols if c not in ('id',)]
if marks_cols:
    g['total_marks'] = g[marks_cols].sum(axis=1)
    g['average_marks'] = g[marks_cols].mean(axis=1)
    # Example percentage (assuming each subject is out of 100)
    g['percentage'] = (g['total_marks'] / (100 * len(marks_cols))) * 100

In [101]:
# 10) Label encode student name to numeric id (factorize)
if any('name' in c for c in g.columns):
    name_col = [c for c in g.columns if 'name' in c][0]
    g['student_encoded'] = pd.factorize(g[name_col])[0]

In [103]:
# 11) Scaling numeric features: min-max for total/average
for c in ['total_marks','average_marks']:
    if c in g.columns:
        minv = g[c].min()
        maxv = g[c].max()
        if maxv - minv > 0:
            g[c + '_minmax'] = (g[c] - minv) / (maxv - minv)
        else:
            g[c + '_minmax'] = 0.0

In [105]:
# 12) Final group explore & summary
print("\n--- Cleaned Group Marks head ---")
display(g.head(6))
print("Cleaned group shape:", g.shape)
group_summary = {
    'original_rows': df_group.shape[0],
    'rows_after_dedup': rows_after,
    'columns_after': g.shape[1],
    'numeric_columns': numeric_cols,
    'marks_columns_detected': marks_cols,
    'missing_values_handled': True,
    'encoding': 'student name factorized => student_encoded (if name present)',
    'scaling': [c + '_minmax' for c in ['total_marks','average_marks'] if c in g.columns]
}


--- Cleaned Group Marks head ---


Unnamed: 0,rollno,name,gender,group,session,age,scholarship,math,english,urdu,total_marks,average_marks,percentage,student_encoded,total_marks_minmax,average_marks_minmax
0,MS01,SAADIA,,,,28,2562,64.0,72.0,74,2800.0,560.0,350.0,0,0.308023,0.308023
1,MS02,JUMAIMA,,,,33,2800,69.0,90.0,88,3080.0,616.0,385.0,1,0.441738,0.441738
2,MS03,ARIFA,,,,34,3500,64.0,95.0,93,3786.0,757.2,473.25,2,0.778892,0.778892
3,MS04,SAADIA,,,,44,2000,47.0,57.0,44,2192.0,438.4,274.0,0,0.01767,0.01767
4,MS05,DANISH,,,,54,2100,76.0,78.0,55,2363.0,472.6,295.375,3,0.099331,0.099331
5,MS06,SAFIA,,,,23,3800,64.0,83.0,78,4048.0,809.6,506.0,4,0.904011,0.904011


Cleaned group shape: (50, 16)


In [107]:
# Part B: Netflix Titles
n = df_netflix.copy()
rows_before_n = n.shape[0]
n = n.drop_duplicates()
rows_after_n = n.shape[0]
print(f"\nNetflix: dropped {rows_before_n - rows_after_n} duplicate rows")


Netflix: dropped 0 duplicate rows


In [109]:
# Normalize column names
n.columns = [c.strip().lower().replace(' ', '_') for c in n.columns]

In [111]:
# Parse date_added to datetime and extract year/month
if 'date_added' in n.columns:
    n['date_added'] = pd.to_datetime(n['date_added'], errors='coerce')
    n['added_year'] = n['date_added'].dt.year
    n['added_month'] = n['date_added'].dt.month

In [113]:
# Fill typical string columns with a safe value
for c in ['director','cast','country','rating']:
    if c in n.columns:
        n[c] = n[c].fillna('Unknown')

In [115]:
# Parse duration into minutes / seasons
if 'duration' in n.columns:
    def parse_duration(x):
        if pd.isna(x):
            return np.nan, np.nan
        s = str(x).strip()
        if 'min' in s.lower():
            try:
                minutes = int(''.join([ch for ch in s if ch.isdigit()]))
                return minutes, np.nan
            except:
                return np.nan, np.nan
        if 'season' in s.lower():
            try:
                seasons = int(''.join([ch for ch in s if ch.isdigit()]))
                return np.nan, seasons
            except:
                return np.nan, np.nan
        return np.nan, np.nan

    parsed = n['duration'].apply(parse_duration)
    n['duration_minutes'] = parsed.apply(lambda t: t[0])
    n['duration_seasons'] = parsed.apply(lambda t: t[1])

In [117]:
if 'listed_in' in n.columns:
    # split and clean
    all_genres_series = n['listed_in'].dropna().str.split(',').apply(lambda L: [s.strip() for s in L])
    top_genres = all_genres_series.explode().value_counts().head(8).index.tolist()
    for gname in top_genres:
        safe_col = 'genre_' + gname.replace(' ', '_').lower()
        n[safe_col] = n['listed_in'].fillna('').apply(lambda s: 1 if (isinstance(s, str) and gname in s) else 0)

In [119]:
# Simple label encoding of 'type' and 'rating' using factorize
if 'type' in n.columns:
    n['type_enc'] = pd.factorize(n['type'])[0]
if 'rating' in n.columns:
    n['rating_enc'] = pd.factorize(n['rating'])[0]

In [121]:
# Create number_of_cast as a simple numeric feature
if 'cast' in n.columns:
    n['num_cast'] = n['cast'].apply(lambda s: 0 if s in (np.nan, 'Unknown', '') else len([it for it in str(s).split(',') if it.strip()!='']))

In [123]:
# Robust-scaling-like transformation for duration_minutes & num_cast (median/IQR)
for c in ['duration_minutes', 'num_cast']:
    if c in n.columns:
        med = n[c].median()
        q1 = n[c].quantile(0.25)
        q3 = n[c].quantile(0.75)
        iqr = q3 - q1 if (q3 - q1) != 0 else 1.0
        n[c + '_robustscaled'] = (n[c] - med) / iqr

In [125]:
# Final netflix explore & summary
print("\n--- Cleaned Netflix head ---")
display(n.head(6))
print("Cleaned netflix shape:", n.shape)
netflix_summary = {
    'original_rows': df_netflix.shape[0],
    'rows_after_dedup': rows_after_n,
    'columns_after': n.shape[1],
    'missing_values_filled': {c: 'Unknown' for c in ['director','cast','country','rating'] if c in n.columns},
    'duration_parsed': 'duration_minutes and duration_seasons' if 'duration' in df_netflix.columns else 'not_present',
    'encoding': 'factorized type & rating, one-hot top genres',
    'scaling': [c + '_robustscaled' for c in ['duration_minutes','num_cast'] if c in n.columns]
}


--- Cleaned Netflix head ---


Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,...,genre_international_tv_shows,genre_documentaries,genre_tv_dramas,genre_action_&_adventure,genre_independent_movies,type_enc,rating_enc,num_cast,duration_minutes_robustscaled,num_cast_robustscaled
0,81193313,Chocolate,Unknown,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,2019-11-30,2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",...,1,0,0,0,0,0,0,8,,0.0
1,81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,Unknown,2019-11-30,2019,TV-G,67 min,"Documentaries, International Movies",...,0,1,0,0,0,1,1,1,-1.071429,-1.0
2,81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...",India,2019-11-30,2019,TV-14,135 min,"Comedies, Dramas, International Movies",...,0,0,0,0,0,1,0,8,1.357143,0.0
3,81082007,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...","France, Senegal, Belgium",2019-11-29,2019,TV-14,106 min,"Dramas, Independent Movies, International Movies",...,0,0,0,0,1,1,0,9,0.321429,0.142857
4,80213643,Chip and Potato,Unknown,"Abigail Oliver, Andrea Libman, Briana Buckmast...","Canada, United Kingdom",NaT,2019,TV-Y,2 Seasons,Kids' TV,...,0,0,0,0,0,0,2,10,,0.285714
5,81172754,Crazy people,Moses Inwang,"Ramsey Nouah, Chigul, Sola Sobowale, Ireti Doy...",Nigeria,2019-11-29,2018,TV-14,107 min,"Comedies, International Movies, Thrillers",...,0,0,0,0,0,1,0,9,0.357143,0.142857


Cleaned netflix shape: (5837, 29)


In [129]:
out_dir = Path('/mnt/data')
g.to_csv("group_marks_cleaned.csv", index=False)
n.to_csv("netflix_cleaned.csv", index=False)
print("\nSaved cleaned CSVs to:")
print("group_marks_cleaned.csv")
print("netflix_cleaned.csv")


Saved cleaned CSVs to:
group_marks_cleaned.csv
netflix_cleaned.csv
