In [3]:
import csv
import webget
import gzip
import shutil
from collections import Counter

#Downloads the compressed file
url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
webget.download(url)

filename = "./title.basics.tsv.gz"
#Creates a check to skip first element
check = False
#Creates empty lists for the data we are looking for
releasedates = []
enddates = []
genres = []

#Unzip our gz file, and save it in the root directory (same directory as this python file)
with gzip.open('title.basics.tsv.gz', 'rb') as f_in:
    with open('title.basics.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

#Read our dataset from the tsv file we unzipped
with open('title.basics.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    #Iterate through our tsv file (dataset)
    for i,row in enumerate(reader):
        #Checks if check is true, if true it is not the first element, thus we are interested in the data.
        if(check):
            #checks if the type of data is a movie (assuming "short" is considered a movie (short movie))
            if row[1] in ('movie', 'short'):
                #Some entries have nothing in the releasedate field (row[5]) and have \\N instead.
                #So we ensure that we skip elements that are equal to \\N
                if row[5] != '\\N':
                    #Due to a VERY big dataset, we only want to count data after the year 2000
                    if int(row[5]) > 2000:
                        #inserts the genre (row[8] and releasedate (row[5]) in their respective lists)
                        genres.append(row[8])
                        releasedates.append(row[5])
            #checks if the type is a serie or tvserie
            if 'Serie' in row[1]:
                #Some entries have nothing in the releasedate field (row[5]) and have \\N instead.
                #So we ensure that we skip elements that are equal to \\N
                if row[6] != '\\N':
                    #Due to a VERY big dataset, we only want to count data after the year 2000
                    if int(row[5]) > 2000:
                       enddates.append(row[6])
        #If check is false, this else is run, as it is the first element in out data set
        #which does not contain relevant data, so we set check to true 
        else:
            check = True

#Create counter elements from our lists (counters are lists containing frequencies of the data in our lists)
rFreq = Counter(releasedates)
eFreq = Counter(enddates)
gFreq = Counter(genres)

print ('Most common releasedates for movies:')
#Prints the 5 most common releasedates
for letter, count in rFreq.most_common(5):
    print ('%s: %7d' % (letter, count))

print ('Most common enddates for series:')
#Prints the 5 most common enddates
for letter, count in eFreq.most_common(5):
    print ('%s: %7d' % (letter, count))

print ('Most common genres for movies:')
#Prints the 5 most common genres
for letter, count in gFreq.most_common(5):
    print ('%s: %7d' % (letter, count))
    
    
print("---------------------------------------------------")
    
#print('What is the average runtime on adult films? \n'')

# setting variables 
filename = 'title.basics.tsv'
amound_of_adultmovies = 0
total_min_adultmovies = 0

# defining function that takes a parameter file, the name of the file.
# opens the file and yields one line of the time.
def get_data(file):
    with open(file) as tsvfile:
        #reader = csv.reader(tsvfile, delimiter='\t')
        for line in tsvfile:
            yield line

# makes a for-loop on the file using the function that yields a line.
# splits the line by tabs and asks if the 4th tab has a 1.
# splits the line by tabs and asks if the 7th tab Does NOT have a N.
# global and then the varibles, so we can chance them.
# plus the numbers from the lines and saves them in the global variables.
for line in get_data(filename):
    if '1' in line.split('\t')[4]:
        if 'N' not in line.split('\t')[7]:
            global total_min_adultmovies
            global amound_of_adultmovies
            total_min_adultmovies += int(line.split('\t')[7])
            amound_of_adultmovies += int(line.split('\t')[4])
            
# prints the calculation
print(f'amound_of_adultmovies = {amound_of_adultmovies}')
print(f'total_min_adultmovies = {total_min_adultmovies}\n')
print(f'{total_min_adultmovies} / {amound_of_adultmovies} = ', (total_min_adultmovies/amound_of_adultmovies))

Downloading file to ./title.basics.tsv.gz
Most common releasedates for movies:
2016:   60578
2017:   59168
2014:   58500
2015:   57264
2013:   54217
Most common enddates for series:
2017:    2672
2016:    2007
2015:    1591
2014:    1393
2013:    1381
Most common genres for movies:
Drama,Short:   84271
Short:   56334
Comedy,Short:   50241
Documentary:   43275
Drama:   30782
---------------------------------------------------
amound_of_adultmovies = 56027
total_min_adultmovies = 5870587

5870587 / 56027 =  104.78139111499813
