# Pandas Basics with the IMDb Top 10,000 Dataset

# Import Pandas

In [1]:
import pandas as pd

# Import Data

In [2]:
# Column names, in file order; handed to read_csv through the `names` argument.
names = ["id", "title", "year", "rating", "votes", "length", "genres"]

# Load the tab-separated file into a DataFrame.
data = pd.read_csv(
    "imdb_top_10000.txt",
    sep="\t",      # fields are tab-delimited
    names=names,   # column names to assign
    index_col=0,   # first column ("id") becomes the row index
)

The separator is a tab, so we use sep="\t".

The names argument is the list of column names

The index_col argument is the column to use as the row labels (the index) of the data; here it is column 0, "id"

# Explore Data

In [3]:
# Preview the top of the data: head() returns the first n rows (default n=5).
data.head()

Unnamed: 0_level_0,title,year,rating,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption (1994),1994,9.2,619479,142 mins.,Crime|Drama
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0137523,Fight Club (1999),1999,8.8,458173,139 mins.,Drama|Mystery|Thriller
tt0133093,The Matrix (1999),1999,8.7,448114,136 mins.,Action|Adventure|Sci-Fi
tt1375666,Inception (2010),2010,8.9,385149,148 mins.,Action|Adventure|Sci-Fi|Thriller


In [4]:
# Preview the bottom of the data: tail() returns the last n rows (default n=5).
data.tail()

Unnamed: 0_level_0,title,year,rating,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0807721,Meduzot (2007),2007,7.0,1357,78 mins.,Drama
tt0339642,Daltry Calhoun (2005),2005,5.2,1357,100 mins.,Comedy|Drama|Music|Romance
tt0060880,The Quiller Memorandum (1966),1966,6.5,1356,104 mins.,Drama|Mystery|Thriller
tt0152836,Taal (1999),1999,6.5,1356,179 mins.,Musical|Romance
tt0279977,The Navigators (2001),2001,6.9,1356,96 mins.,Comedy|Drama


In [5]:
# Prints (rather than returns) a concise summary: the index range, each column's
# dtype and non-null count, and the memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, tt0111161 to tt0279977
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   10000 non-null  object 
 1   year    10000 non-null  int64  
 2   rating  10000 non-null  float64
 3   votes   10000 non-null  int64  
 4   length  10000 non-null  object 
 5   genres  9999 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 546.9+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,year,rating,votes
count,10000.0,10000.0,10000.0
mean,1993.4728,6.38607,16604.0128
std,14.829924,1.189933,34563.459698
min,1950.0,1.5,1356.0
25%,1986.0,5.7,2333.75
50%,1998.0,6.6,4980.5
75%,2005.0,7.2,15277.75
max,2011.0,9.2,619479.0


# Export Data

In [7]:
# Write the DataFrame to "test.csv", semicolon-delimited, keeping the header row and the index column.
data.to_csv("test.csv", header=True, index=True, sep=";")

Saves the data to a CSV file named "test.csv"

The header and index arguments are set to True to include the column names and the index column

The sep argument is set to ";" to use a semicolon as the separator/delimiter

# Sort Data

In [8]:
# Returns a copy of the data sorted by the rating column in ascending order (the default);
# `data` itself is left unchanged.
data.sort_values(by="rating")

Unnamed: 0_level_0,title,year,rating,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0270846,Superbabies: Baby Geniuses 2 (2004),2004,1.5,13196,88 mins.,Comedy|Family
tt0059464,Monster a-Go Go (1965),1965,1.5,3255,70 mins.,Sci-Fi|Horror
tt0364986,Ben & Arthur (2002),2002,1.5,4675,85 mins.,Drama|Romance
tt0421051,Daniel the Wizard (2004),2004,1.5,8271,81 mins.,Comedy|Crime|Family|Fantasy|Horror
tt1309000,Dream Well (2009),2009,1.5,2848,00 mins.,Comedy|Romance|Sport
...,...,...,...,...,...,...
tt0071562,The Godfather: Part II (1974),1974,9.0,291169,200 mins.,Crime|Drama
tt0060196,"The Good, the Bad and the Ugly (1966)",1966,9.0,195238,161 mins.,Western
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0068646,The Godfather (1972),1972,9.2,474189,175 mins.,Crime|Drama


In [9]:
# Same sort, but ascending=False puts the highest ratings first.
data.sort_values(by="rating", ascending=False)

Unnamed: 0_level_0,title,year,rating,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0111161,The Shawshank Redemption (1994),1994,9.2,619479,142 mins.,Crime|Drama
tt0068646,The Godfather (1972),1972,9.2,474189,175 mins.,Crime|Drama
tt0060196,"The Good, the Bad and the Ugly (1966)",1966,9.0,195238,161 mins.,Western
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0252487,Outrageous Class (1975),1975,9.0,9823,87 mins.,Comedy|Drama
...,...,...,...,...,...,...
tt0364986,Ben & Arthur (2002),2002,1.5,4675,85 mins.,Drama|Romance
tt0060753,Night Train to Mundo Fine (1966),1966,1.5,3542,89 mins.,Action|Adventure|Crime|War
tt0421051,Daniel the Wizard (2004),2004,1.5,8271,81 mins.,Comedy|Crime|Family|Fantasy|Horror
tt0059464,Monster a-Go Go (1965),1965,1.5,3255,70 mins.,Sci-Fi|Horror


In [10]:
# Sorts by rating first; rows with equal ratings are then ordered by year (both ascending).
data.sort_values(by=["rating", "year"])

Unnamed: 0_level_0,title,year,rating,votes,length,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0059464,Monster a-Go Go (1965),1965,1.5,3255,70 mins.,Sci-Fi|Horror
tt0060666,Manos: The Hands of Fate (1966),1966,1.5,20927,74 mins.,Horror
tt0060753,Night Train to Mundo Fine (1966),1966,1.5,3542,89 mins.,Action|Adventure|Crime|War
tt0364986,Ben & Arthur (2002),2002,1.5,4675,85 mins.,Drama|Romance
tt0270846,Superbabies: Baby Geniuses 2 (2004),2004,1.5,13196,88 mins.,Comedy|Family
...,...,...,...,...,...,...
tt0071562,The Godfather: Part II (1974),1974,9.0,291169,200 mins.,Crime|Drama
tt0252487,Outrageous Class (1975),1975,9.0,9823,87 mins.,Comedy|Drama
tt0110912,Pulp Fiction (1994),1994,9.0,490065,154 mins.,Crime|Thriller
tt0068646,The Godfather (1972),1972,9.2,474189,175 mins.,Crime|Drama


# Create Data Frame

In [11]:
# Small hand-made dataset: one list of values per column name.
sample_data = dict(
    tv=[230, 44, 17],
    radio=[37, 39, 45],
    news=[69, 45, 69],
    sales=[22, 10, 9],
)

In [12]:
# Build a DataFrame from the dictionary: each key becomes a column, each list its values.
data2 = pd.DataFrame(data=sample_data)

In [13]:
# Display the DataFrame built from sample_data.
data2

Unnamed: 0,tv,radio,news,sales
0,230,37,69,22
1,44,39,45,10
2,17,45,69,9


# Select Data

In [15]:
# Selecting a single column by name returns it as a Series (the index is kept).
data["title"]

id
tt0111161    The Shawshank Redemption (1994)
tt0110912                Pulp Fiction (1994)
tt0137523                  Fight Club (1999)
tt0133093                  The Matrix (1999)
tt1375666                   Inception (2010)
                          ...               
tt0807721                     Meduzot (2007)
tt0339642              Daltry Calhoun (2005)
tt0060880      The Quiller Memorandum (1966)
tt0152836                        Taal (1999)
tt0279977              The Navigators (2001)
Name: title, Length: 10000, dtype: object

In [17]:
# Passing a list of column names (note the double brackets) returns a DataFrame
# containing only those columns.
data[["title", "rating"]]

Unnamed: 0_level_0,title,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0111161,The Shawshank Redemption (1994),9.2
tt0110912,Pulp Fiction (1994),9.0
tt0137523,Fight Club (1999),8.8
tt0133093,The Matrix (1999),8.7
tt1375666,Inception (2010),8.9
...,...,...
tt0807721,Meduzot (2007),7.0
tt0339642,Daltry Calhoun (2005),5.2
tt0060880,The Quiller Memorandum (1966),6.5
tt0152836,Taal (1999),6.5


In [26]:
# Arithmetic mean of the rating column.
data["rating"].mean()

6.38607

In [27]:
# Median (50th percentile) of the rating column.
data["rating"].median()

6.6

In [30]:
# Collect the highest and lowest rating in one dict so both are displayed together.
dict(Max=data["rating"].max(), Min=data["rating"].min())

{'Max': 9.2, 'Min': 1.5}

In [32]:
# Distinct values of the genres column, in order of first appearance. Each value is the
# full "A|B|C" combination string, not an individual genre; a missing value shows up as NaN.
data["genres"].unique()

array(['Crime|Drama', 'Crime|Thriller', 'Drama|Mystery|Thriller', ...,
       'Drama|War|Adventure|Romance', 'Western|Sci-Fi|Thriller',
       'Adventure|Comedy|Drama|War'], dtype=object)

In [33]:
# Counts how many rows have each distinct rating; the result is ordered by count,
# most frequent first.
data["rating"].value_counts()

7.1    401
6.8    401
7.2    386
6.7    384
7.0    382
      ... 
1.5      7
1.6      6
8.9      5
9.0      4
9.2      2
Name: rating, Length: 77, dtype: int64

In [38]:
# Same counts, but sort_index() reorders them by the rating value itself (ascending)
# instead of by frequency.
data["rating"].value_counts().sort_index()

1.5     7
1.6     6
1.7    12
1.8    12
1.9     9
       ..
8.7    13
8.8     9
8.9     5
9.0     4
9.2     2
Name: rating, Length: 77, dtype: int64