In [1]:
# import necessary modules
import pandas as pd


In [2]:
# relative path to the submissions CSV
path_to_file = '../data/filmfreeway_archived_submission_2024-04-02.csv'

# the file is read with header=None, so the column labels are supplied explicitly
column_names = [
    'festival_name',
    'project',
    'notification_date',
    'submission_status',
    'judging_status',
]
df = pd.read_csv(path_to_file, header=None, names=column_names)

# preview the first ten rows
df.head(10)

Unnamed: 0,festival_name,project,notification_date,submission_status,judging_status
0,Madison Film Festival,Echo,"August 5, 2024",In Consideration,Undecided
1,Cinemancia Festival Metropolitano de Cine,Echo,"July 22, 2024",In Consideration,Undecided
2,Soo Film Festival,Echo,"July 26, 2024",In Consideration,Undecided
3,Short Thread Film Festival,Echo,"May 30, 2024",In Consideration,Undecided
4,Weindorfer Short Film Festival,Echo,"April 26, 2024",In Consideration,Undecided
5,Open Festival Marseille,Echo,"June 30, 2024",In Consideration,Undecided
6,Sistas Are Doin' It For Themselves Short Film ...,Echo,"April 5, 2024",In Consideration,Undecided
7,"TRETS, Festival Internacional de Cinema i Cult...",Echo,"April 21, 2024",In Consideration,Undecided
8,FESTIVAL AUDIOVISUAL INTERNACIONAL TERRITORIO ...,Echo,"July 15, 2024",In Consideration,Undecided
9,EUROPEAN Film & Screenplay Festival,Echo,"April 10, 2024",In Consideration,Undecided


In [3]:
# inspect the dtype pandas inferred for each column right after loading
# (no dtype or date parsing was requested in read_csv, so this shows the raw inference)
df.dtypes

festival_name        object
project              object
notification_date    object
submission_status    object
judging_status       object
dtype: object

In [4]:
# parse the textual dates (e.g. "August 5, 2024") into datetime values,
# then store them back in the same column
notification_dates = pd.to_datetime(df['notification_date'])
df['notification_date'] = notification_dates
df.head(10)

Unnamed: 0,festival_name,project,notification_date,submission_status,judging_status
0,Madison Film Festival,Echo,2024-08-05,In Consideration,Undecided
1,Cinemancia Festival Metropolitano de Cine,Echo,2024-07-22,In Consideration,Undecided
2,Soo Film Festival,Echo,2024-07-26,In Consideration,Undecided
3,Short Thread Film Festival,Echo,2024-05-30,In Consideration,Undecided
4,Weindorfer Short Film Festival,Echo,2024-04-26,In Consideration,Undecided
5,Open Festival Marseille,Echo,2024-06-30,In Consideration,Undecided
6,Sistas Are Doin' It For Themselves Short Film ...,Echo,2024-04-05,In Consideration,Undecided
7,"TRETS, Festival Internacional de Cinema i Cult...",Echo,2024-04-21,In Consideration,Undecided
8,FESTIVAL AUDIOVISUAL INTERNACIONAL TERRITORIO ...,Echo,2024-07-15,In Consideration,Undecided
9,EUROPEAN Film & Screenplay Festival,Echo,2024-04-10,In Consideration,Undecided


In [5]:
# re-check the dtypes after the pd.to_datetime cast:
# notification_date is expected to be a datetime64 column now, the others unchanged
df.dtypes

festival_name                object
project                      object
notification_date    datetime64[ns]
submission_status            object
judging_status               object
dtype: object

In [6]:
# extract month and year from the datetime column.
# The .dt accessor works directly on the Series (aligned on df's index), so there is
# no need to wrap the column in a temporary DatetimeIndex for each component.
df['notification_date_month'] = df['notification_date'].dt.month
df['notification_date_year'] = df['notification_date'].dt.year
df.head(10)

Unnamed: 0,festival_name,project,notification_date,submission_status,judging_status,notification_date_month,notification_date_year
0,Madison Film Festival,Echo,2024-08-05,In Consideration,Undecided,8,2024
1,Cinemancia Festival Metropolitano de Cine,Echo,2024-07-22,In Consideration,Undecided,7,2024
2,Soo Film Festival,Echo,2024-07-26,In Consideration,Undecided,7,2024
3,Short Thread Film Festival,Echo,2024-05-30,In Consideration,Undecided,5,2024
4,Weindorfer Short Film Festival,Echo,2024-04-26,In Consideration,Undecided,4,2024
5,Open Festival Marseille,Echo,2024-06-30,In Consideration,Undecided,6,2024
6,Sistas Are Doin' It For Themselves Short Film ...,Echo,2024-04-05,In Consideration,Undecided,4,2024
7,"TRETS, Festival Internacional de Cinema i Cult...",Echo,2024-04-21,In Consideration,Undecided,4,2024
8,FESTIVAL AUDIOVISUAL INTERNACIONAL TERRITORIO ...,Echo,2024-07-15,In Consideration,Undecided,7,2024
9,EUROPEAN Film & Screenplay Festival,Echo,2024-04-10,In Consideration,Undecided,4,2024


In [7]:
# count the festivals notified in each (year, month) pair.
# The whole pipeline is one chain: group -> count non-null festival names ->
# flatten the group keys back into columns -> rename the count column -> order by year, month.
df_aggregated = (
    df.groupby(['notification_date_year', 'notification_date_month'])['festival_name']
    .count()
    .reset_index()
    .rename(columns={'festival_name': 'festival_count'})
    .sort_values(['notification_date_year', 'notification_date_month'])
)
df_aggregated.head(10)

Unnamed: 0,notification_date_year,notification_date_month,festival_count
0,2020,12,1
1,2022,6,1
2,2022,7,3
3,2022,9,5
4,2022,10,3
5,2023,2,2
6,2023,4,4
7,2023,5,11
8,2023,6,25
9,2023,7,33


In [8]:
# Generate data for the remaining years and months without festival

# 1st step: build a lookup of the form {(year, month): festival_count}.
# set_index is used WITHOUT inplace=True: it returns a new frame, so df_aggregated
# keeps its columns and this cell can be re-run without raising a KeyError.
festival_count_dict = (
    df_aggregated
    .set_index(['notification_date_year', 'notification_date_month'])['festival_count']
    .to_dict()
)
print(festival_count_dict)

# 2nd step: one [year, month, festival_count] row for every calendar month of 2020-2024;
# months missing from the lookup get a count of 0.
festival = [
    [year, month, festival_count_dict.get((year, month), 0)]
    for year in range(2020, 2025)
    for month in range(1, 13)
]

# 3rd step: recreate the df, this time complete with each year and month
df_complete = pd.DataFrame(
    festival,
    columns=['notification_date_year', 'notification_date_month', 'festival_count'],
)
df_complete

{(2020, 12): 1, (2022, 6): 1, (2022, 7): 3, (2022, 9): 5, (2022, 10): 3, (2023, 2): 2, (2023, 4): 4, (2023, 5): 11, (2023, 6): 25, (2023, 7): 33, (2023, 8): 48, (2023, 9): 43, (2023, 10): 54, (2023, 11): 21, (2023, 12): 14, (2024, 1): 14, (2024, 2): 17, (2024, 3): 24, (2024, 4): 27, (2024, 5): 21, (2024, 6): 16, (2024, 7): 16, (2024, 8): 9, (2024, 9): 3, (2024, 10): 5, (2024, 11): 3}


Unnamed: 0,notification_date_year,notification_date_month,festival_count
0,2020,1,0
1,2020,2,0
2,2020,3,0
3,2020,4,0
4,2020,5,0
5,2020,6,0
6,2020,7,0
7,2020,8,0
8,2020,9,0
9,2020,10,0


In [9]:
# "quality check"
# list the months, if any, that have no festival
print('year-month without festival :')
months_without_festival = df_complete[df_complete['festival_count'] == 0]
for year, month in zip(months_without_festival['notification_date_year'],
                       months_without_festival['notification_date_month']):
    print(f"{year} - {month}")

year-month without festival :
2020 - 1
2020 - 2
2020 - 3
2020 - 4
2020 - 5
2020 - 6
2020 - 7
2020 - 8
2020 - 9
2020 - 10
2020 - 11
2021 - 1
2021 - 2
2021 - 3
2021 - 4
2021 - 5
2021 - 6
2021 - 7
2021 - 8
2021 - 9
2021 - 10
2021 - 11
2021 - 12
2022 - 1
2022 - 2
2022 - 3
2022 - 4
2022 - 5
2022 - 8
2022 - 11
2022 - 12
2023 - 1
2023 - 3
2024 - 12
