<a href="https://colab.research.google.com/github/JManas02/Data-Science/blob/main/network_data_with_Kumu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# !rm *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

total 2311900
drwxr-xr-x 1 root root      4096 Jul 30 13:27 .
drwxr-xr-x 1 root root      4096 Jul 30 13:21 ..
drwxr-xr-x 4 root root      4096 Jul 27 13:36 .config
-rw-r--r-- 1 root root 251516986 Jul 29 13:25 name.basics.tsv.gz
-rw-r--r-- 1 root root 251516986 Jul 29 13:25 name.basics.tsv.gz.1
drwxr-xr-x 1 root root      4096 Jul 27 13:37 sample_data
-rw-r--r-- 1 root root 310734144 Jul 29 13:25 title.akas.tsv.gz
-rw-r--r-- 1 root root 310734144 Jul 29 13:25 title.akas.tsv.gz.1
-rw-r--r-- 1 root root 175310103 Jul 29 13:25 title.basics.tsv.gz
-rw-r--r-- 1 root root 175310103 Jul 29 13:25 title.basics.tsv.gz.1
-rw-r--r-- 1 root root 446110311 Jul 29 13:25 title.principals.tsv.gz
-rw-r--r-- 1 root root 446110311 Jul 29 13:25 title.principals.tsv.gz.1


In [None]:
# Title metadata keyed by IMDb title id (`tconst`).
# Only the columns used later are parsed; the remaining columns of the dump are
# never read into memory.
title_columns = ['titleType', 'primaryTitle', 'startYear']
title = (
    pd.read_csv(
        'title.basics.tsv.gz',
        sep='\t',
        low_memory=False,
        usecols=['tconst'] + title_columns,
    )
    .set_index('tconst')[title_columns]
)

In [None]:
# The dump marks an unknown year with the literal string "\N".
# Parse the column as numbers (anything non-numeric becomes NaN), then drop the
# titles whose year is unknown so that no title is given a made-up year.
# Re-running this cell is harmless: an already numeric column passes through.
start_year = pd.to_numeric(title['startYear'], errors='coerce')
title = (
    title.assign(startYear=start_year)
    .dropna(subset=['startYear'])
    .astype({'startYear': 'int'})
)

In [None]:
# Keep only the titles whose start year is 1995 or later.
recent_enough = title['startYear'] >= 1995
title = title.loc[recent_enough]

In [None]:
# (rows, columns) of `title` at this point in the notebook
title.shape

(8199840, 3)

In [None]:
# Peek at the first few titles to check the columns and the parsed year
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0011801,movie,Tötet nicht mehr,2019
tt0013274,movie,Istoriya grazhdanskoy voyny,2021
tt0015414,movie,La tierra de los toros,2000
tt0034413,short,Youth Gets a Break,2001
tt0035423,movie,Kate & Leopold,2001


In [None]:
# Load the cast of each film
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
is_performer = cast['category'].isin(['actor', 'actress'])
cast = cast.loc[is_performer]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [None]:
# Only consider movies, not TV series, etc. Shrinks data to ~5%
movies = title.loc[title['titleType'].eq('movie')]
in_a_movie = cast['tconst'].isin(movies.index)
cast = cast.loc[in_a_movie]
# This is what the network looks like
cast.head()

Unnamed: 0,tconst,nconst,category
80741,tt0011801,nm0459029,actor
80742,tt0011801,nm0681726,actor
80743,tt0011801,nm0692612,actress
80744,tt0011801,nm0726256,actor
80745,tt0011801,nm0776458,actor


In [None]:
# Restrict data to just a single region (e.g. IN, US, etc)
# This loads the region for each title: a Series of region codes indexed by title id
region = (
    pd.read_csv(
        'title.akas.tsv.gz',
        sep='\t',
        low_memory=False,
        usecols=['titleId', 'region'],
    )
    .set_index('titleId')['region']
)
# The ten regions with the most entries
region.value_counts().head(10)

In [None]:
# Load the name data along with birth year.
# "\N" entries are read as missing, and birthYear is kept as float so it can hold NaN.
name = (
    pd.read_csv(
        'name.basics.tsv.gz',
        sep='\t',
        na_values='\\N',
        dtype={'birthYear': float},
    )
    .set_index('nconst')[['primaryName', 'birthYear']]
)

In [None]:
# Peek at the first few people (primary name and birth year, indexed by nconst)
name.head()

In [None]:
from scipy.sparse import csr_matrix

In [None]:
def get_pairs(lang=None, min_acted=25, min_pairings=1, unique_pairs=False):
    '''
    Build the actor co-appearance network from the notebook-level `cast` table.

    Parameters
    ----------
    lang : str, optional
        Region code as stored in the notebook-level `region` series (IN, US, etc).
        When given, only titles listed for that region are considered.
    min_acted : int
        Keep only actors credited in at least this many distinct films.
    min_pairings : int
        Keep only pairs credited together in at least this many films.
    unique_pairs : bool
        The co-appearance matrix is symmetric, so by default each pair is
        reported twice, as (a, b) and as (b, a). Pass True to keep a single
        row per unordered pair.

    Returns
    -------
    pairs : DataFrame
        Columns `row`, `col` and `n`. `row` and `col` are positions into the
        second return value; `n` is the number of films the two actors share.
        An actor is never paired with themselves.
    actors : DataFrame
        The notebook-level `name` table reindexed to the kept actor ids, in the
        order that `row`/`col` refer to.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    # Count each (film, actor) credit once, so that a repeated credit inflates
    # neither the films-per-actor count nor the pair counts.
    graph = graph.drop_duplicates(subset=['tconst', 'nconst'])
    name_freq = graph['nconst'].value_counts()
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')

    # Category codes give dense 0..k-1 ids for films (rows) and actors (columns).
    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')
    # Pass the shape explicitly: it cannot be inferred when no credit survives
    # the filters, and an empty selection should give an empty result.
    shape = (len(p['title'].cat.categories), len(p['name'].cat.categories))

    # film x actor incidence matrix; its Gram matrix counts shared films.
    matrix = csr_matrix((data, (row, col)), shape=shape)
    square = (matrix.T @ matrix).tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    # The diagonal holds each actor's own film count, not a pairing.
    keep = (pairs['row'] != pairs['col']) & (pairs['n'] >= min_pairings)
    if unique_pairs:
        keep &= pairs['row'] < pairs['col']
    pairs = pairs[keep].reset_index(drop=True)
    return pairs, name.reindex(p['name'].cat.categories)

def lookup(pairs, cat):
    '''
    Replace the positional actor codes in `pairs` with actor details.

    Parameters
    ----------
    pairs : DataFrame
        Columns `row`, `col` and `n`, where `row` and `col` are integer
        positions into `cat`. Any index is accepted.
    cat : DataFrame
        Two columns per actor (name, then birth year), ordered so that
        position i describes actor code i.

    Returns
    -------
    DataFrame
        Columns `count`, `name1`, `year1`, `name2`, `year2`, sorted by `count`
        in descending order.
    '''
    # concat(axis=1) aligns on index labels, so give all three pieces the same
    # 0..n-1 index; otherwise a filtered `pairs` would be joined to the wrong rows.
    pairs = pairs.reset_index(drop=True)
    first = cat.iloc[pairs['row'].to_numpy()].reset_index(drop=True)
    second = cat.iloc[pairs['col'].to_numpy()].reset_index(drop=True)

    named = pd.concat([pairs.drop(columns=['row', 'col']), first, second], axis=1)
    named.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return named.sort_values('count', ascending=False)

In [None]:
# Indian titles only: actors with at least 10 films, pairs sharing at least 3
pairs, cat = get_pairs(
    lang='IN',
    min_acted=10,
    min_pairings=3,
)

In [None]:
# Swap the positional row/col codes for actor names and birth years,
# sorted with the most frequent pairings first
ForKumu = lookup(pairs, cat)
ForKumu

In [None]:
# Keep only the edge columns and give them the From / To / Strength headers
kumu_headers = {'name1': 'From', 'name2': 'To', 'count': 'Strength'}
ForKumu = ForKumu.loc[:, list(kumu_headers)].rename(columns=kumu_headers)
ForKumu


Unnamed: 0,From,To,Strength
275,Brahmanandam,Mohammad Ali,34
531,Mohammad Ali,Brahmanandam,34
268,Raghu Babu,Mohammad Ali,20
1851,Uttar Kumar,Kavita Joshi,20
4859,Kavita Joshi,Uttar Kumar,20
...,...,...,...
2026,Kota Srinivasa Rao,Raghu Babu,3
2029,Apoorva,Raghu Babu,3
2030,Sivaji,Raghu Babu,3
2031,Banerjee,Raghu Babu,3


In [None]:
# Save the From/To/Strength edge list as an Excel workbook, leaving out the DataFrame index
ForKumu.to_excel("pairs_1.xlsx", index = False)