## Research question 1

**1) What is the distribution of entities in the director-crew dataset:**
* Number of featured movies?
* Number of directors, number of movies per director, and average number of movies per director?
* Number of crew members?
* Number of roles and frequency of each role?

In [3]:
import os
import pandas as pd
import json

# Root folder holding one sub-folder per director. Set the FILM_DIRECTORS_DIR environment
# variable to point the notebook at another copy of the data without editing this cell;
# the original machine-specific location stays as the fallback so existing runs behave the same.
main_directory = os.environ.get(
    'FILM_DIRECTORS_DIR',
    r'C:\Users\nayak\OneDrive\Desktop\Network-Analysis-data-641-01\group_project\film-directors',
)

def count_directors_and_movies(main_directory):
    """Count featured movies per director under ``main_directory``.

    Every immediate sub-folder that contains a ``normalized_data`` folder is treated as
    one director. Each ``*.json`` file inside that folder counts as one featured movie,
    and the text before the first underscore of the file name is used as its title id.

    Parameters
    ----------
    main_directory : str
        Folder whose sub-folders are named after director ids.

    Returns
    -------
    df_directors : pandas.DataFrame
        One row per director with the columns ``'Director ID'`` and
        ``'Number of Movies'``, ordered by director id.
    total_movies : int
        Number of distinct title ids over all directors; a title that appears under
        several directors is counted once.
    avg_movies_per_director : float
        Mean of the ``'Number of Movies'`` column, or 0 when no director was found.
    """
    directors_data = []
    unique_movies = set()  # distinct title ids seen across all directors

    # os.listdir makes no ordering promise; sorting keeps the table reproducible.
    for director_dir in sorted(os.listdir(main_directory)):
        normalized_data_dir = os.path.join(main_directory, director_dir, 'normalized_data')
        # isdir is False for missing paths and for plain files, so one check is enough.
        if not os.path.isdir(normalized_data_dir):
            continue

        featured_movies = [name for name in os.listdir(normalized_data_dir) if name.endswith('.json')]
        unique_movies.update(name.split('_')[0] for name in featured_movies)
        directors_data.append({'Director ID': director_dir, 'Number of Movies': len(featured_movies)})

    total_movies = len(unique_movies)
    # Naming the columns keeps the frame well-formed even when no director was found.
    df_directors = pd.DataFrame(directors_data, columns=['Director ID', 'Number of Movies'])
    num_directors = len(df_directors)

    # The average is taken over the per-director counts so it matches the table above.
    # Dividing the de-duplicated total by the number of directors under-reports it
    # whenever two directors share a title.
    movies_credited = sum(row['Number of Movies'] for row in directors_data)
    avg_movies_per_director = movies_credited / num_directors if num_directors > 0 else 0

    return df_directors, total_movies, avg_movies_per_director

# Scan the dataset once, then report the headline figures for research question 1.
directors_df, total_movies, avg_movies_per_director = count_directors_and_movies(main_directory)

# Collect the report lines first so the whole summary is emitted in one go.
summary_lines = [
    f"Number of directors: {len(directors_df)}",
    f"Total no. of featured movies is: {total_movies}",
    f"Average number of movies per director is: {avg_movies_per_director}",
    f"Number of movies per director:\n{directors_df.to_string(index=False)}",
]
print("\n".join(summary_lines))


Number of directors: 101
Total no. of featured movies is: 1383
Average number of movies per director is: 13.693069306930694
Number of movies per director:
Director ID  Number of Movies
  nm0000095                53
  nm0000116                12
  nm0000142                41
  nm0000165                36
  nm0000186                21
  nm0000217                40
  nm0000229                38
  nm0000231                25
  nm0000233                14
  nm0000318                20
  nm0000338                28
  nm0000343                23
  nm0000361                32
  nm0000386                18
  nm0000399                20
  nm0000464                17
  nm0000487                14
  nm0000490                41
  nm0000500                24
  nm0000517                10
  nm0000520                13
  nm0000600                17
  nm0000631                30
  nm0000709                25
  nm0000759                 9
  nm0000777                11
  nm0000876                13
  nm0

In [70]:
# Flatten the credits of every movie into one record per listed crew member.
crews_data = []

for director_dir in os.listdir(main_directory):
    director_path = os.path.join(main_directory, director_dir)
    if not os.path.isdir(director_path):
        continue  # stray files next to the director folders are ignored

    normalized_data_dir = os.path.join(director_path, 'normalized_data')
    if not os.path.isdir(normalized_data_dir):
        continue  # nothing to read for this director

    json_names = [name for name in os.listdir(normalized_data_dir) if name.endswith('.json')]
    for file_name in json_names:
        with open(os.path.join(normalized_data_dir, file_name), 'r', encoding='utf-8') as handle:
            data = json.load(handle)

        # The title id is whatever precedes the first underscore of the file name.
        title_id = file_name.split('_')[0]

        for credit in data['full_credits']:
            # Fields shared by every crew member listed under this credit section.
            shared_fields = {
                'Director ID': director_dir,
                'Title ID': title_id,
                'Role': credit['role'],
                'Normalized Role': credit.get('normalized role', ''),
            }
            crews_data.extend(
                {
                    **shared_fields,
                    'Crew Name': crew_member['name'],
                    'Crew Link': crew_member['link'],
                    'Subrole': crew_member.get('normalized_credit', ''),
                }
                for crew_member in credit['crew']
            )

crews_df = pd.DataFrame(crews_data)


In [71]:
## Formatting the contents of the Subrole column, because its values are stored as lists
import re
def extract_letters(subrole):
    """Flatten a subrole value to text and keep only ASCII letters and whitespace.

    Parameters
    ----------
    subrole : str, list, tuple, None or float
        A single credit string, or a sequence of credit fragments which are joined
        with single spaces. ``None`` and NaN are treated as "no subrole".

    Returns
    -------
    str
        The text with every character other than ``a-z``, ``A-Z`` and whitespace
        removed. Whitespace is left as is, so removed punctuation can leave
        consecutive or trailing spaces behind.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if subrole is None or (isinstance(subrole, float) and subrole != subrole):
        return ''

    if isinstance(subrole, (list, tuple)):
        # str() lets a stray non-string fragment pass through instead of breaking join.
        subrole_str = ' '.join(str(part) for part in subrole)
    else:
        subrole_str = str(subrole)

    # Strip digits, punctuation and any non-ASCII characters; keep letters and spaces.
    return re.sub(r'[^a-zA-Z\s]', '', subrole_str)

# Reduce every Subrole entry to plain letters, then save the cleaned table to disk.
cleaned_subroles = crews_df['Subrole'].apply(extract_letters)
crews_df['Subrole'] = cleaned_subroles
crews_df.to_csv('crew.csv', index=False)

# Last expression of the cell: display the cleaned frame.
crews_df


Unnamed: 0,Director ID,Title ID,Role,Normalized Role,Crew Name,Crew Link,Subrole
0,nm0000095,tt0061177,Directed by,Directed by,Woody Allen,https://www.imdb.com/name/nm0000095/?ref_=ttfc...,aided abetted by
1,nm0000095,tt0061177,Directed by,Directed by,Senkichi Taniguchi,https://www.imdb.com/name/nm0849474/?ref_=ttfc...,original japanese version
2,nm0000095,tt0061177,Writing Credits,Writing Credits,Woody Allen,https://www.imdb.com/name/nm0000095/?ref_=ttfc...,special material by
3,nm0000095,tt0061177,Writing Credits,Writing Credits,Frank Buxton,https://www.imdb.com/name/nm0125518/?ref_=ttfc...,with writings by
4,nm0000095,tt0061177,Writing Credits,Writing Credits,Louise Lasser,https://www.imdb.com/name/nm0489837/?ref_=ttfc...,with writings by
...,...,...,...,...,...,...,...
687853,nm3363032,tt9114286,Thanks,Thanks,Larry Stroman,https://www.imdb.com/name/nm7975106/?ref_=ttfc...,with special thanks to
687854,nm3363032,tt9114286,Thanks,Thanks,Mark Texeira,https://www.imdb.com/name/nm4088173/?ref_=ttfc...,with special thanks to
687855,nm3363032,tt9114286,Thanks,Thanks,Jocelyn Thomas,https://www.imdb.com/name/nm0859036/?ref_=ttfc...,special thanks
687856,nm3363032,tt9114286,Thanks,Thanks,Roy Thomas,https://www.imdb.com/name/nm0859471/?ref_=ttfc...,with special thanks to


In [73]:
### Filtering the roles that we want
# Roles that are kept in full, whatever the subrole.
roles_to_include = [
    'Casting By',
    'Cinematography by',
    'Costume Design by',
    'Directed by',
    'Film Editing by',
    'Music by',
    'Production Design by',
    'Writing Credits',
]

keep_whole_role = crews_df['Normalized Role'].isin(roles_to_include)
filtered_df = crews_df[keep_whole_role]
print(len(filtered_df))

# Makeup Department: department heads only.
makeup_department_df = crews_df[
    (crews_df['Normalized Role'] == 'Makeup Department')
    & crews_df['Subrole'].isin(['hair department head', 'makeup department head'])
]
print(len(makeup_department_df))

# Produced by: the three spellings of a plain producer credit.
produced_by_df = crews_df[
    (crews_df['Normalized Role'] == 'Produced by')
    & crews_df['Subrole'].isin(['producer', 'producer produced by', 'producer produced by pga'])
]
print(len(produced_by_df))

# Sound Department: mixers, designers and supervising editors.
Sound_department_df = crews_df[
    (crews_df['Normalized Role'] == 'Sound Department')
    & crews_df['Subrole'].isin(['rerecording mixer', 'sound designer', 'supervising sound editor'])
]
print(len(Sound_department_df))

# Special Effects by: supervisors only.
# NOTE(review): the recorded run matched a single 'visual effects supervisor' row under
# this role - such credits may be filed under a different role; confirm against the data.
special_effects_df = crews_df[
    (crews_df['Normalized Role'] == 'Special Effects by')
    & crews_df['Subrole'].isin(['special effects supervisor', 'visual effects supervisor'])
]
print(len(special_effects_df))

14558
844
3860
3500
624


In [74]:
# Stack the five role-specific selections into one table with a fresh 0..n-1 index.
role_frames = [
    makeup_department_df,
    produced_by_df,
    Sound_department_df,
    special_effects_df,
    filtered_df,
]
all_roles_df = pd.concat(role_frames, ignore_index=True)

# Row count first, then the table itself.
print(len(all_roles_df))
print(all_roles_df)


23386
      Director ID   Title ID                  Role       Normalized Role  \
0       nm0000095  tt0313792     Makeup Department     Makeup Department   
1       nm0000095  tt2334873     Makeup Department     Makeup Department   
2       nm0000095  tt2334873     Makeup Department     Makeup Department   
3       nm0000095  tt2334873     Makeup Department     Makeup Department   
4       nm0000095  tt2334873     Makeup Department     Makeup Department   
...           ...        ...                   ...                   ...   
23381   nm3363032  tt9114286              Music by              Music by   
23382   nm3363032  tt9114286     Cinematography by     Cinematography by   
23383   nm3363032  tt9114286            Casting By            Casting By   
23384   nm3363032  tt9114286  Production Design by  Production Design by   
23385   nm3363032  tt9114286     Costume Design by     Costume Design by   

                   Crew Name  \
0                 Lori Hicks   
1             Gre

In [75]:
# Count distinct crew members. A display name is not a unique identifier (two different
# people can share one), so each row is keyed on the IMDb person id embedded in
# 'Crew Link' - the "nm..." segment that follows "/name/". Rows whose link carries no
# such id fall back to the crew name.
crew_key = (
    all_roles_df['Crew Link']
    .astype(str)
    .str.extract(r'/name/(nm\d+)', expand=False)
    .fillna(all_roles_df['Crew Name'])
)

# Keep the first row seen for each person, as drop_duplicates does by default.
unique_crew_df = all_roles_df[~crew_key.duplicated()]

# Get the total number of unique crew members
total_unique_crew = len(unique_crew_df)
print(f"Total number of unique crew members: {total_unique_crew}")


Total number of unique crew members: 8004


In [76]:
# Rows per normalized role, most frequent first.
role_column = all_roles_df['Normalized Role']
role_frequency = role_column.value_counts()

# Number of distinct roles, followed by the per-role counts.
print(f"Number of roles: {len(role_frequency)}")
print("Frequency of each role:", role_frequency, sep="\n")


Number of roles: 11
Frequency of each role:
Normalized Role
Produced by             3860
Writing Credits         3820
Sound Department        3500
Directed by             2742
Cinematography by       1940
Casting By              1909
Music by                1523
Production Design by    1359
Costume Design by       1265
Makeup Department        844
Special Effects by       624
Name: count, dtype: int64


In [79]:
# Subroles of interest, grouped by the normalized role they belong to.
roles_subroles = {
    'Makeup Department': ['hair department head', 'makeup department head'],
    'Produced by': ['producer', 'producer produced by', 'producer produced by pga'],
    'Sound Department': ['rerecording mixer', 'sound designer', 'supervising sound editor'],
    'Special Effects by': ['special effects supervisor', 'visual effects supervisor'],
}

role_column = all_roles_df['Normalized Role']
subrole_column = all_roles_df['Subrole']

# One "<role> - <subrole>" entry per pair, holding the number of rows that match both.
subrole_frequency = {
    f"{role} - {subrole}": int(((role_column == role) & (subrole_column == subrole)).sum())
    for role, subroles in roles_subroles.items()
    for subrole in subroles
}

# Report the counts in the order the pairs were declared.
print("Frequency of each subrole:")
for label, matches in subrole_frequency.items():
    print(f"{label}: {matches}")


Frequency of each subrole:
Makeup Department - hair department head: 383
Makeup Department - makeup department head: 461
Produced by - producer: 3057
Produced by - producer produced by: 592
Produced by - producer produced by pga: 211
Sound Department - rerecording mixer: 2113
Sound Department - sound designer: 536
Sound Department - supervising sound editor: 851
Special Effects by - special effects supervisor: 623
Special Effects by - visual effects supervisor: 1


### **Q2. How will you characterize the film-director network? What are its properties (degree distribution, average shortest path length, triangles a.k.a. clustering coefficient, density/sparsity)?**

In [83]:
# # Create a new column 'Unique ID' in the DataFrame
# crews_df['Unique ID'] = crews_df.index.astype(str) + crews_df['Director ID'].str[-4:] + crews_df['Title ID'].str[-4:]

# # Print the DataFrame with the new 'Unique ID' column
# print(crews_df[['Director ID', 'Title ID', 'Unique ID']])


In [None]:
# import networkx as nx

# # Assuming you have a NetworkX graph representing the film-director network
# # Calculate degree distribution
# degree_dist = nx.degree_histogram(graph)

# # Calculate average shortest path length
# avg_shortest_path_length = nx.average_shortest_path_length(graph)

# # Calculate clustering coefficient
# clustering_coefficient = nx.average_clustering(graph)

# # Calculate network density
# density = nx.density(graph)

# print("Degree Distribution:", degree_dist)
# print("Average Shortest Path Length:", avg_shortest_path_length)
# print("Clustering Coefficient:", clustering_coefficient)
# print("Density:", density)
