In [None]:
import pandas as pd

#### Create a series

In [None]:
# Build a Series of four student names. No index is passed, so pandas assigns
# the default integer labels 0..3.
name = pd.Series(['Adam','Bob','Dave','Emily'])

In [None]:
# Display the Series: labels on the left, values on the right.
name

In [None]:
# The index object that holds the row labels.
name.index

In [None]:
# The stored values as an array, without the labels.
name.values

In [None]:
name

In [None]:
# Look up the element labelled 0 in the default integer index.
name[0]

In [None]:
# A second Series holding one age per student, again with the default integer index.
age = pd.Series([15,16,16,15])

In [None]:
age

In [None]:
# Replace the integer labels with the student names so ages can be looked up by name.
age.index = name

In [None]:
# The index now holds the four names instead of 0..3.
age.index

In [None]:
age

In [None]:
# Label-based lookup using the new name index.
age['Adam']

#### Create a dataframe

In [None]:
# One Series per column; the DataFrame constructor takes a mapping of
# column label -> Series and aligns them on their shared integer index.
name = pd.Series(['Adam', 'Bob', 'Dave', 'Emily'])
age = pd.Series([15, 16, 16, 15])
gender = pd.Series(['M', 'M', 'M', 'F'])
students = pd.DataFrame(dict(Name=name, Age=age, Gender=gender))

In [None]:
# Display the frame assembled from the three Series.
students

In [None]:
# Same table, this time built straight from plain Python lists
# (one list per column) without creating Series first.
students = pd.DataFrame(
    dict(
        Name=['Adam', 'Bob', 'Dave', 'Emily'],
        Age=[15, 16, 16, 15],
        Gender=['M', 'M', 'M', 'F'],
    )
)

In [None]:
# Display the frame built from the dictionary of lists.
students

#### Index

In [None]:
# Use the values of the 'Name' column as row labels. The column itself stays
# in the frame, so the names now appear both as index and as a column.
students.index = students['Name']

In [None]:
students

In [None]:
# set_index moves the 'Name' column into the index and drops it from the columns.
# inplace=True mutates `students`, so running this cell a second time raises
# KeyError because the 'Name' column no longer exists.
students.set_index('Name', inplace=True)

In [None]:
students

#### Indexing

In [None]:
# Single column by label -> Series.
students['Age']

In [None]:
# Single row by index label.
students.loc['Adam']

In [None]:
# Single row by integer position.
students.iloc[0]

In [None]:
# One cell: row label first, then column label.
students.loc['Adam','Age']

In [None]:
# A list of column labels -> DataFrame with just those columns.
students[['Age','Gender']]

In [None]:
# A list of row labels -> DataFrame with just those rows.
students.loc[['Adam','Bob']]

In [None]:
# Positional slice: rows at positions 0 and 1 (the end of the slice is excluded).
students.iloc[0:2]

In [None]:
# Rows and columns together; the result follows the order given in each list.
students.loc[['Adam','Bob'],['Gender','Age']]

#### Adding rows

In [None]:
# Assigning to a label that is not in the index adds a new row. The list is
# matched to the columns by position, i.e. Age then Gender.
students.loc['Fred'] = [16, 'M']

In [None]:
students

In [None]:
# Two additional students, indexed by name and carrying the Age and Gender columns.
students2 = pd.DataFrame(
    data=dict(Age=[16, 15], Gender=['F', 'M']),
    index=['Grace', 'Henry'],
)
students2

In [None]:
# Stack the rows of `students2` underneath `students`. DataFrame.append was
# removed in pandas 2.0; pd.concat is the supported replacement and keeps the
# name labels of both frames as the index of the result.
students = pd.concat([students, students2])

In [None]:
# Display the combined frame.
students

In [None]:
# Assigning a list to a new column label adds that column. The list must have
# one value per row: seven here (the four original students, Fred, Grace, Henry).
students['Height'] = [170, 167, 172, 163, 165, 158, 175]

In [None]:
students

#### Remove rows or columns

In [None]:
# Return a copy without the rows labelled 'Bob' and 'Grace'; `students` itself is unchanged.
students.drop(index=['Bob', 'Grace'])

In [None]:
# Return a copy without the 'Age' and 'Height' columns; `students` itself is unchanged.
students.drop(columns=['Age', 'Height'])

## Analyzing IMDb Movies Data

In [None]:
# Read the CSV using a relative path, i.e. the file is looked up in the
# notebook's working directory. Rows get the default integer index.
movies_df = pd.read_csv("IMDB-Movie-Data.csv")

In [None]:
# First five rows.
movies_df.head()

In [None]:
# Use the "Title" column as the row labels.
movies_df = movies_df.set_index("Title")
movies_df.head(2)

In [None]:
# Undo the step above: "Title" becomes a regular column again and the
# rows go back to integer labels.
movies_df = movies_df.reset_index()
movies_df.head(2)

In [None]:
# Re-read the file, this time choosing "Title" as the index while loading.
# The rest of the notebook works on this Title-indexed frame.
movies_df = pd.read_csv("IMDB-Movie-Data.csv", index_col='Title')

In [None]:
movies_df.head()

In [None]:
# (number of rows, number of columns)
movies_df.shape

In [None]:
# Column names, non-null counts and dtypes.
movies_df.info()

#### Rename columns

In [None]:
# Column labels as loaded from the file.
movies_df.columns

In [None]:
# Drop the unit suffixes from two column names. rename() returns a new frame,
# which is bound back to `movies_df`.
movies_df = movies_df.rename(
    columns={'Runtime (Minutes)': 'Runtime', 'Revenue (Millions)': 'Revenue'}
)

In [None]:
# Column labels after renaming.
movies_df.columns

#### Select columns

In [None]:
# A single column label inside one pair of brackets selects a Series.
type(movies_df['Genre'])

In [None]:
movies_df['Genre']

In [None]:
# A list of labels, even a one-element list, selects a DataFrame.
type(movies_df[['Genre']])

In [None]:
type(movies_df[['Genre','Rating']])

In [None]:
movies_df[['Genre','Rating']]

#### Select rows

In [None]:
# .loc with a single label selects that row; the result is a Series when the
# title occurs once in the index.
type(movies_df.loc["Prometheus"])

In [None]:
movies_df.loc["Prometheus"]

In [None]:
# .loc with a list of labels, even a one-element list, returns a DataFrame.
type(movies_df.loc[["Prometheus"]])

In [None]:
type(movies_df.loc[["Prometheus","Suicide Squad"]])

In [None]:
movies_df.loc[["Prometheus","Suicide Squad"]]

#### Select rows and columns

In [None]:
# Label-based selection: the listed titles, restricted to the listed columns.
movies_df.loc[["Prometheus","Suicide Squad"],["Genre","Rating"]]

In [None]:
# Position-based selection: rows at positions 1 and 4, columns at positions 1 and 7.
movies_df.iloc[[1,4],[1,7]]

#### Missing data

In [None]:
# Boolean frame of the same shape: True wherever a value is missing.
movies_df.isnull()

In [None]:
# Number of missing values in each column.
movies_df.isnull().sum()

In [None]:
# fillna(0) returns a copy with missing values replaced by 0, so every count is 0.
movies_df.fillna(0).isnull().sum()

In [None]:
# Filling does not remove anything, so the shape is unchanged.
movies_df.fillna(0).shape

In [None]:
# dropna() returns a copy without the rows that contain a missing value;
# `movies_df` itself is left untouched here.
movies_df2 = movies_df.dropna()

In [None]:
movies_df2.shape

In [None]:
# From here on, work only with complete rows: rebind `movies_df` to the
# frame that has every row containing a missing value removed.
movies_df = movies_df.dropna()

In [None]:
# Shape after the rows with missing values have been removed.
movies_df.shape

#### Statistics functions

In [None]:
# Column means. The frame also holds text columns (Genre, Description, Director,
# Actors); since pandas 2.0 reductions no longer skip them silently and raise
# TypeError instead, so restrict the computation to numeric columns explicitly.
movies_df.mean(numeric_only=True)

In [None]:
# First quartile of every numeric column. numeric_only=True is required on
# pandas >= 2.0, where quantile() no longer drops the text columns by default.
movies_df.quantile(0.25, numeric_only=True)

In [None]:
# Summary statistics; with mixed dtypes describe() reports on the numeric columns by default.
movies_df.describe()

In [None]:
# describe() on text columns reports count, number of unique values,
# the most frequent value and its frequency.
movies_df[["Genre","Description","Director","Actors"]].describe()

In [None]:
# Same kind of summary, selecting the text columns by dtype instead of by name.
movies_df.select_dtypes("object").describe()

#### Conditional selection

In [None]:
# Element-wise comparison -> boolean Series, one True/False per movie.
movies_df['Director']=="Ridley Scott"

In [None]:
# Using that boolean Series as a mask keeps only the rows where it is True.
movies_df[movies_df['Director']=="Ridley Scott"]

In [None]:
# isin() is True where the director is any of the listed names.
movies_df[movies_df['Director'].isin(['Christopher Nolan', 'Ridley Scott'])].head()

In [None]:
# Movies rated 9 or higher.
movies_df[movies_df['Rating'] >= 9]

In [None]:
# Movies released 2005-2010 (both ends included), rated above 8.0, whose
# revenue lies below the first quartile of all revenues.
released_2005_to_2010 = movies_df['Year'].between(2005, 2010)
rated_above_8 = movies_df['Rating'] > 8.0
revenue_in_bottom_quartile = movies_df['Revenue'] < movies_df['Revenue'].quantile(0.25)
movies_df[released_2005_to_2010 & rated_above_8 & revenue_in_bottom_quartile]

#### apply()

In [None]:
def rating_function(x, threshold=8.0):
    """Label a rating as "good" or "bad".

    Parameters
    ----------
    x : number
        The rating to classify.
    threshold : number, default 8.0
        Lowest rating that still counts as "good". The default keeps the
        original fixed cut-off, so ``Series.apply(rating_function)`` behaves
        as before.

    Returns
    -------
    str
        "good" when ``x >= threshold``, otherwise "bad". NaN is labelled
        "bad" because every comparison with NaN is false.
    """
    return "good" if x >= threshold else "bad"

In [None]:
# apply() calls rating_function once per rating and collects the labels in a new Series.
movies_df["Rating"].apply(rating_function)

In [None]:
# Store the labels as a new column.
movies_df["Rating_category"] = movies_df["Rating"].apply(rating_function)

In [None]:
movies_df.head()

In [None]:
# Same classification written inline as a lambda; it overwrites the column with identical values.
movies_df["Rating_category"] = movies_df["Rating"].apply(lambda x: 'good' if x >= 8.0 else 'bad')
movies_df.head()

#### apply() with a lambda function

In [None]:
# Turn the comma-separated actor string into a list of names. Each name is
# stripped: splitting on ',' alone keeps any whitespace that follows a comma,
# and an exact-match test such as `"Chris Pratt" in actors` would then only
# succeed when the name happens to be first in the list. Values that are not
# strings are passed through unchanged, so re-running this cell on rows that
# were already split does not raise.
movies_df["Actors"] = movies_df["Actors"].apply(
    lambda x: [actor.strip() for actor in x.split(',')] if isinstance(x, str) else x
)

In [None]:
# The Actors column now holds a list per movie.
movies_df.head()

In [None]:
# Build a boolean mask with apply(): True where the movie's actor list contains
# an element exactly equal to "Chris Pratt", then keep only those rows.
movies_df[movies_df['Actors'].apply(lambda x: "Chris Pratt" in x)]

#### Sort

In [None]:
# "Title" is the name of the index, not a column; sort_values() also accepts
# an index level name in `by`.
movies_df.sort_values(by="Title").head()

In [None]:
# Sort by Year, then by Revenue within each year; ascending=False applies to
# both keys, so the newest and highest-grossing movies come first.
movies_df.sort_values(by=["Year","Revenue"], ascending=False).head()

#### Group by

In [None]:
# Group on a copy whose index has been reset, so rows carry integer labels
# 0..n-1 in the current row order of `movies_df`.
groups = movies_df.reset_index().groupby("Year")
type(groups)

In [None]:
# Number of movies in each year.
groups.size()

In [None]:
# Summary statistics of Revenue within each year.
groups["Revenue"].describe()

In [None]:
# For every year, the index label of the row with the highest Revenue. The
# labels come from the reset frame above, so they are integers.
max_revenue_index = groups[["Revenue"]].idxmax()["Revenue"]
max_revenue_index

In [None]:
# iloc is position-based: the integer labels are used as row positions of
# `movies_df`. That lines up only because reset_index() numbered the rows in
# the same order `movies_df` has now. The result is the Year of each selected
# movie, labelled by its title.
movies_df.iloc[max_revenue_index,]["Year"]

#### Plot

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Scatter plot of revenue against rating; the trailing ';' hides the Axes repr.
movies_df.plot.scatter(x='Rating', y='Revenue', title='Revenue vs Rating');

In [None]:
# Histogram of the ratings; the trailing ';' hides the Axes repr.
movies_df['Rating'].plot.hist(title='Rating');