In [1]:
import pandas as pd
import re
from datetime import datetime

In [2]:
# Mapping for French month names to numbers
month_mapping = {
    'janvier': '01',
    'février': '02',
    'mars': '03',
    'avril': '04',
    'mai': '05',
    'juin': '06',
    'juillet': '07',
    'août': '08',
    'septembre': '09',
    'octobre': '10',
    'novembre': '11',
    'décembre': '12'
}

def parse_release_date(date_str):
    # Converts "12 mars 2025" into "2025-03-12"
    if pd.isna(date_str):
        return None
    parts = date_str.strip().split()
    if len(parts) != 3:
        return date_str
    day, month_fr, year = parts
    month = month_mapping.get(month_fr.lower(), '01')
    return f"{year}-{month}-{day.zfill(2)}"

In [3]:
def duration_to_minutes(duration_str):
    # Converts "1h 26min" into integer minutes (86)
    if pd.isna(duration_str):
        return None
    match = re.search(r'(\d+)\s*h\s*(\d+)?', duration_str)
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2)) if match.group(2) else 0
        return hours * 60 + minutes
    return None

In [4]:
def extract_number(text):
    # Remove any spaces and then check if the remaining text is a number
    if pd.isna(text):
        return None
    cleaned = text.replace(" ", "")
    try:
        return int(cleaned)
    except ValueError:
        # Fallback: extract the first sequence of digits if conversion fails
        import re
        match = re.search(r'(\d+)', cleaned)
        return int(match.group(1)) if match else None

In [5]:
def split_list(text, separator=','):
    # Splits a string into a trimmed list
    if pd.isna(text) or text.strip() == "":
        return []
    return [x.strip() for x in text.split(separator) if x.strip()]

In [6]:
def clean_awards(text):
    # Separates awards and nominations from a string like "1 prix et 1 nomination" or "20 nominations"
    awards = None
    nominations = None
    if pd.isna(text) or text.strip() == "":
        return awards, nominations
    match_awards = re.search(r'(\d+)\s*prix', text, re.IGNORECASE)
    match_nominations = re.search(r'(\d+)\s*nomination', text, re.IGNORECASE)
    if match_awards:
        awards = int(match_awards.group(1))
    if match_nominations:
        nominations = int(match_nominations.group(1))
    return awards, nominations

In [7]:
def clean_viewer_critics(text):
    # Splits a string like "566 notes, 241 critiques" into a tuple (notes, critiques)
    notes = None
    critiques = None
    if pd.isna(text) or text.strip()=="":
        return notes, critiques
    match_notes = re.search(r'(\d+)\s*notes', text, re.IGNORECASE)
    match_critiques = re.search(r'(\d+)\s*critiques', text, re.IGNORECASE)
    if match_notes:
        notes = int(match_notes.group(1))
    if match_critiques:
        critiques = int(match_critiques.group(1))
    return notes, critiques

In [8]:
def parse_week_period(period_text):
    """
    Given a French week period string like "05 au 12 mars 2025",
    extract start and end dates and return a tuple:
      (formatted_period, iso_week_number)
    where formatted_period is "dd/mm/yyyy – dd/mm/yyyy" using the same month/year for both dates.
    """
    if pd.isna(period_text) or period_text.strip() == "":
        return None, None
    # Pattern expecting something like: "05 au 12 mars 2025"
    pattern = r'(\d{1,2})\s*au\s*(\d{1,2})\s*(\w+)\s*(\d{4})'
    match = re.search(pattern, period_text.strip(), re.IGNORECASE)
    if not match:
        return period_text, None
    start_day, end_day, month_fr, year = match.groups()
    month = month_mapping.get(month_fr.lower(), '01')
    start_date_str = f"{start_day.zfill(2)}/{month}/{year}"
    end_date_str = f"{end_day.zfill(2)}/{month}/{year}"
    # Determine ISO week number using the start date
    try:
        dt_start = datetime.strptime(f"{year}-{month}-{start_day.zfill(2)}", "%Y-%m-%d")
        iso_week = dt_start.isocalendar()[1]
    except Exception:
        iso_week = None
    formatted_period = f"{start_date_str} – {end_date_str}"
    return formatted_period, iso_week

In [9]:
def get_season(date_str):
    """
    Given a date string in "YYYY-MM-DD", returns the season (Winter, Spring, Summer, Autumn).
    """
    if pd.isna(date_str):
        return None
    try:
        dt = datetime.strptime(date_str, "%Y-%m-%d")
    except ValueError:
        return None
    month = dt.month
    # Northern hemisphere seasons
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Autumn"

In [10]:
def clean_data(df):
    # film_title: remove extra spaces
    df['film_title'] = df['film_title'].str.strip()
    
    # film_url: extract film_id
    df['film_id'] = df['film_url'].apply(
        lambda x: re.search(r'fichefilm_gen_cfilm=(\d+)', x).group(1) if re.search(r'fichefilm_gen_cfilm=(\d+)', x) else None
    )
    
    # release_date: reformat date
    df['release_date'] = df['release_date'].apply(parse_release_date)
    
    # Derive season from release_date
    df['release_season'] = df['release_date'].apply(get_season)
    
    # duration: convert to minutes
    df['duration_minutes'] = df['duration'].apply(duration_to_minutes)
    
    # age_classification: remove extra spaces
    df['age_classification'] = df['age_classification'].str.strip()
    
    # producers: split and expand into columns
    producers_split = df['producers'].apply(lambda x: split_list(x))
    df['producers_count'] = producers_split.apply(len)
    max_producers = producers_split.apply(len).max()
    for i in range(max_producers):
        df[f'producer_{i+1}'] = producers_split.apply(lambda x: x[i] if len(x) > i else None)
    
    # director: clean spaces
    df['director'] = df['director'].str.strip()
    
    # top_stars: split and expand into columns
    top_stars_split = df['top_stars'].apply(lambda x: split_list(x))
    df['top_stars_count'] = top_stars_split.apply(len)
    max_stars = top_stars_split.apply(len).max()
    for i in range(max_stars):
        df[f'top_star_{i+1}'] = top_stars_split.apply(lambda x: x[i] if len(x) > i else None)
    
    # languages: count number of languages instead of list
    df['language_count'] = df['languages'].apply(lambda x: len(split_list(x, separator=",")))
    
    # film_nationality: count number of nationalities
    df['film_nationality_count'] = df['film_nationality'].apply(lambda x: len(split_list(x, separator=",")))
    
    # filming_secrets: extract number and cast to int (if missing, set to 0)
    df['filming_secrets_num'] = df['filming_secrets'].apply(lambda x: extract_number(x))
    df['filming_secrets_num'] = df['filming_secrets_num'].apply(lambda x: int(x) if pd.notna(x) else 0)
    
    # Process week periods for fr_entry_week and us_entry_week
    for col in ['fr_entry_week', 'us_entry_week']:
        period, iso_week = zip(*df[col].apply(lambda x: parse_week_period(x)))
        df[f'{col}_period'] = period
        df[f'{col}_iso_week'] = [int(i) if pd.notna(i) else 0 for i in iso_week]
    
    # Process fr_entries and us_entries as numbers (remove spaces and cast to int)
    for col in ['fr_entries', 'us_entries']:
        df[f'{col}_num'] = df[col].apply(lambda x: extract_number(x))
        df[f'{col}_num'] = df[f'{col}_num'].apply(lambda x: int(x) if pd.notna(x) else 0)
    
    # awards: separate awards and nominations and cast to int (defaulting to 0)
    awards_data = df['awards'].apply(clean_awards)
    df['award_count'] = awards_data.apply(lambda x: int(x[0]) if pd.notna(x[0]) else 0)
    df['nomination_count'] = awards_data.apply(lambda x: int(x[1]) if pd.notna(x[1]) else 0)
    # Create total awards+nomination column, as int
    df['total_awards_nomination'] = df.apply(lambda row: int(row['award_count']) + int(row['nomination_count']), axis=1)
    
    # associated_genres: keep only count (drop the list version)
    df['associated_genres_count'] = df['associated_genres'].apply(lambda x: len(split_list(x, separator=",")))
    
    # press_critics_count: extract number and cast to int
    df['press_critics_count_num'] = df['press_critics_count'].apply(lambda x: extract_number(x))
    df['press_critics_count_num'] = df['press_critics_count_num'].apply(lambda x: int(x) if pd.notna(x) else 0)
    
    # viewer_critics_count: split into notes and critiques and cast them to int
    viewer_data = df['viewer_critics_count'].apply(clean_viewer_critics)
    df['viewer_notes'] = viewer_data.apply(lambda x: int(x[0]) if pd.notna(x[0]) else 0)
    df['viewer_critiques'] = viewer_data.apply(lambda x: int(x[1]) if pd.notna(x[1]) else 0)
    
    # synopsis: add a column for the length of the synopsis
    df['synopsis_length'] = df['synopsis'].apply(lambda x: len(x) if pd.notna(x) else 0)
    
    return df

if __name__ == '__main__':
    input_path = '/Users/michaeladebayo/Documents/Simplon/brief_projects/movie_prediction/scraping/allocinescraper/films.csv'
    output_path = '/Users/michaeladebayo/Documents/Simplon/brief_projects/movie_prediction/machine_learning/films_cleaned.csv'
    
    df = pd.read_csv(input_path)
    df_cleaned = clean_data(df)
    df_cleaned.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")

Cleaned data saved to /Users/michaeladebayo/Documents/Simplon/brief_projects/movie_prediction/machine_learning/films_cleaned.csv


In [11]:
pd.set_option('display.max_columns', None)

df_cleaned.head()

Unnamed: 0,film_title,film_url,film_image_url,release_date,duration,age_classification,producers,director,top_stars,press_rating,viewer_rating,languages,distributor,year_of_production,film_nationality,filming_secrets,fr_entry_week,us_entry_week,fr_entries,us_entries,awards,budget,associated_genres,press_critics_count,viewer_critics_count,synopsis,film_id,release_season,duration_minutes,producers_count,producer_1,producer_2,producer_3,producer_4,producer_5,top_stars_count,top_star_1,top_star_2,top_star_3,language_count,film_nationality_count,filming_secrets_num,fr_entry_week_period,fr_entry_week_iso_week,us_entry_week_period,us_entry_week_iso_week,fr_entries_num,us_entries_num,award_count,nomination_count,total_awards_nomination,associated_genres_count,press_critics_count_num,viewer_notes,viewer_critiques,synopsis_length
0,The Electric State,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/91...,2025-03-14,2h 08min,,"Joe Russo,Anthony Russo,Christopher Markus,Ste...",Joe Russo,"Millie Bobby Brown,Chris Pratt,Anthony Mackie",2.0,2.6,Anglais,,2025,,-,,,,,,-,"Aventure,Science Fiction",13 critiques,"1149 notes, 160 critiques",Une adolescente réalise que son nouvel ami rob...,261389,Spring,128.0,4,Joe Russo,Anthony Russo,Christopher Markus,Stephen McFeely,,3,Millie Bobby Brown,Chris Pratt,Anthony Mackie,1,0,0,,0,,0,0,0,0,0,0,2,13,1149,160,210
1,Un parfait inconnu,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/7b...,2025-01-29,2h 20min,Tout public,"James Mangold,Jay Cocks",James Mangold,"Timothée Chalamet,Edward Norton,Elle Fanning",3.9,4.1,Anglais,The Walt Disney Company France,2024,,10 anecdotes,29 janvier au 5 février 2025,27 au 30 décembre 2024,308 895,11 655 553,20 nominations,-,"Biopic,Drame,Musical",38 critiques,"5102 notes, 563 critiques","New York, 1961. Alors que la scène musicale es...",280195,Winter,140.0,2,James Mangold,Jay Cocks,,,,3,Timothée Chalamet,Edward Norton,Elle Fanning,1,0,10,29 janvier au 5 février 2025,0,27/12/2024 – 30/12/2024,52,308895,11655553,0,20,20,3,38,5102,563,529
2,Dis-moi juste que tu m'aimes,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img5.acsta.net/c_310_420/img/42...,2025-02-19,1h 51min,Tout public,Anne Le Ny,Anne Le Ny,"Omar Sy,Élodie Bouchez,Vanessa Paradis",3.0,3.0,Français,SND,2024,,9 anecdotes,05 au 12 mars 2025,,62 556,,,-,Drame,25 critiques,"1029 notes, 163 critiques","Au bout de quinze ans de mariage, une crise me...",321257,Winter,111.0,1,Anne Le Ny,,,,,3,Omar Sy,Élodie Bouchez,Vanessa Paradis,1,0,9,05/03/2025 – 12/03/2025,10,,0,62556,0,0,0,0,1,25,1029,163,554
3,Magma,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img6.acsta.net/c_310_420/img/01...,2025-03-19,1h 25min,Tout public,"Cyprien Vial,Nicolas Pleskof",Cyprien Vial,"Marina Foïs,Théo Christine,Mathieu Demy",2.9,3.2,Français,Pyramide Distribution,2025,,6 anecdotes,,,,,,-,Drame,29 critiques,"315 notes, 68 critiques",Katia Reiter dirige l’Observatoire Volcanologi...,318223,Spring,85.0,2,Cyprien Vial,Nicolas Pleskof,,,,3,Marina Foïs,Théo Christine,Mathieu Demy,1,0,6,,0,,0,0,0,0,0,0,1,29,315,68,407
4,Les Condés,https://www.allocine.fr/film/fichefilm_gen_cfi...,https://fr.web.img2.acsta.net/c_310_420/img/58...,2025-03-26,1h 24min,Tout public,"Nordine Salhi,Ryad Luc Montel",Nordine Salhi,"Nordine Salhi,Ichem Bougheraba,Arriles Amrani",1.6,1.6,Français,Apollo Films,2025,,6 anecdotes,,,,,,-,Comédie,5 critiques,"115 notes, 27 critiques","À Marseille, la Police Nationale n’y arrive pl...",1000017317,Spring,84.0,2,Nordine Salhi,Ryad Luc Montel,,,,3,Nordine Salhi,Ichem Bougheraba,Arriles Amrani,1,0,6,,0,,0,0,0,0,0,0,1,5,115,27,302
