# Pandas Tutorial

### Importing libraries

Use *pip install pandas* or *conda install pandas*

In [None]:
!pip install pandas
!pip install numpy
!pip install matplotlib

In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Basic Syntax

#### Series

A Series is essentially a single column of a table. It can have a name (title) and custom index labels.

In [101]:
# Build a named Series from a plain Python list.
# No index is given, so the elements are labelled 0, 1, 2.
series = pd.Series(data=[20, 21, 300], name="numbers")
series

It can be indexed and sliced like a list in Python

In [102]:
# With the default 0, 1, 2 index, [0] returns the first element, just as it would for a list.
series[0]

In [103]:
# A slice returns another Series holding the first two elements (the stop position is excluded).
series[:2]

#### The dataframe

In [104]:
# A dictionary maps each column name to the list of values in that column.
data = {
    "a": [1, 2, 3],
    "b": [4, 5, 6],
    "c": [7, 8, 9],
}
# Every key becomes a column; the rows are labelled 0, 1, 2.
df = pd.DataFrame(data)
df

In [105]:
# axis=1 concatenates along the columns: the Series is placed next to the
# DataFrame's columns, and rows are matched up by their index labels.
col_df = pd.concat([series, df], axis=1)
col_df

In [106]:
# A one-row DataFrame with the same column names as df, ready to be appended to it.
new_row = pd.DataFrame({"a": [13], "b": [14], "c": [15]})
new_row

In [107]:
# axis=0 concatenates along the rows: new_row is stacked underneath df.
# Each input keeps its own index labels, so the label 0 now appears twice.
row_df = pd.concat([df, new_row], axis=0)
row_df

In [108]:
# Renumber the rows 0..n-1. drop=True discards the old index instead of
# keeping it as an extra column.
row_df = row_df.reset_index(drop=True)
row_df

In [109]:
# Assigning a list to a new column name adds that column;
# the list needs one value per row.
names = ["Mungo", "Greg", "Balthazar"]
col_df["Names"] = names
col_df

In [110]:
# Remove the columns a, b and c. columns=[...] is the same as passing the
# labels together with axis=1 (axis=1 means "drop columns").
col_df = col_df.drop(columns=["a", "b", "c"])
col_df

In [111]:
# rename takes an {old name: new name} mapping; columns not mentioned are left alone.
col_df = col_df.rename(columns={"numbers": "Age"})
col_df

*Exercise 1: Add your name and age to col_df as a new row*

*Exercise 2: Add a new column called 'Surname', with a made up surname for each character*

#### External data

Upload the real_estate.csv file under files->upload file and run the code below

In [None]:
# Load the uploaded CSV into a DataFrame.
# NOTE: the path assumes the file was uploaded to /content (presumably a Google Colab
# session); adjust it if the notebook runs somewhere else.
real_estate = pd.read_csv("/content/real_estate.csv")

In [115]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
real_estate.info()

In [116]:
# isna() flags every missing cell; sum() then counts the flags per column,
# giving the number of null values in each column.
real_estate.isna().sum()

In [117]:
# head() with no argument returns the first 5 rows.
real_estate.head()

In [118]:
# Columns can be accessed by name: a list of names inside [] returns a
# DataFrame with just those columns.
columns_of_interest = ["Serial Number", "List Year"]
real_estate[columns_of_interest].head()

In [119]:
# The same kind of selection with iloc, which works on positions: every row (:),
# and the columns at positions 0 and 1 (:2 excludes position 2).
real_estate.iloc[:, :2].head()

In [120]:
# iloc selects rows and columns by their integer position:
# [0, 0] is the single value in the first row of the first column.
real_estate.iloc[0, 0]

In [121]:
# Selecting rows based on a condition: the comparison yields one True/False per row,
# and indexing the frame with it keeps only the True rows.
listed_in_2020 = real_estate["List Year"] == 2020
real_estate[listed_in_2020].head()

In [122]:
# Selecting one column based on a condition: .loc[rows, column] applies the row
# condition and picks the Town column in a single step.
real_estate.loc[real_estate["List Year"] == 2020, "Town"].head()

*Exercise 3: Show the address of all commercial properties*

*Exercise 4: Show all data for properties that are in Avon*

*Exercise 5: Show the first 10 rows of the sales ratio and the address in one dataframe*

In [126]:
# unique() returns each distinct value of the column once, in order of first appearance.
real_estate["Property Type"].unique()

In [127]:
# value_counts() returns how many times each distinct value appears in the column,
# most frequent first.
real_estate["Property Type"].value_counts()

In [128]:
# Selecting rows where a column is null: isna() is True for the missing entries.
non_use_code_missing = real_estate["Non Use Code"].isna()
real_estate[non_use_code_missing].head()

In [129]:
# Selecting rows where a column is NOT null: ~ flips every True/False in the mask.
non_use_code_missing = real_estate["Non Use Code"].isna()
real_estate[~non_use_code_missing].head()

In [130]:
# Selecting rows where a column is not equal to a value.
# REMEMBER THE BRACKETS: ~ binds more tightly than ==, so without them Python would
# apply ~ to the column first and only then compare the result with 2019.
not_2019 = ~(real_estate["List Year"] == 2019)
real_estate.loc[not_2019, "List Year"].unique()

In [131]:
# isin() returns one True/False per row: True where the value is in the given list.
real_estate["List Year"].isin([2020])

In [132]:
# Selecting rows where a column is equal to any value in a list.
year_in_list = real_estate["List Year"].isin([2020])
real_estate[year_in_list]

*Exercise 6: What will the following code return:*

real_estate[real_estate['List Year']==2020]

*Exercise 7: How many addresses are null?*

*Exercise 8: What will the following code return:*

real_estate['Property Type'][real_estate['Residential Type']=='Two Family'].unique()

*Exercise 9: What will the following code return:*

real_estate[real_estate['Residential Type']=='Single Family'][real_estate['Property Type'].isin(twofam)]['Property Type'].unique()

*Exercise 10: Find how many unique towns are in the dataset*

*Exercise 11: Find how many towns have a condo for sale and not a four family house (hint: revise how to use ".isin()")*

In [140]:
# info() also works on a single column; here it shows the column's current dtype.
real_estate["List Year"].info()

In [141]:
# Changing the data type of a column: astype() returns a converted copy, which is
# stored back under the same column name.
# Note: from here on the years are strings, so re-running an earlier cell that compares
# List Year with the integer 2020 will no longer match any row.
real_estate["List Year"] = real_estate["List Year"].astype(str)
real_estate["List Year"].info()

In [142]:
# A small Series of strings to demonstrate the .str methods.
words = ["Hello", "World", "!"]
string_list = pd.Series(words)
string_list

In [143]:
# .str applies a string method to every element: each "l" in each string becomes "w".
# The result is a new Series; string_list itself is not modified.
string_list.str.replace("l","w")

In [144]:
# The integers 1 to 10 (range excludes its stop value, hence 11).
numbers = pd.Series(range(1, 11))
# apply() calls the given function on the values and returns the results as a new Series.
numbers.apply(np.sqrt)

In [145]:
# apply() also accepts a lambda (a small unnamed function). This one is called once per
# element and returns the larger of the element and 5, so values below 5 become 5.
numbers.apply(lambda value: max(value, 5))

#### Visualisation

In [146]:
#creating bar charts
# Take the bar labels and the bar heights from the SAME Series so that every bar
# sits under its own year. (value_counts() on its own is ordered by frequency, not
# by year, so pairing it with a separately sorted list of years mislabels the bars.)
listings_per_year = real_estate['List Year'].value_counts().sort_index()

plt.figure(figsize=(11,5))
plt.bar(listings_per_year.index.astype(str), listings_per_year.values)
plt.xlabel('List Year')
plt.ylabel('Number of Properties')
plt.title('Number of Properties Listed per Year')

#value_counts() counts the number of times each year appears in the series (most frequent first)
#sort_index() re-orders those counts by year in ascending order
#.index holds the years, .values holds the matching counts
#astype(str) converts the years to strings so each one is drawn as its own bar label

In [147]:
#plotting two scatterplots side by side

# Remove outliers: keep only the rows below both thresholds. The mask is built once
# and reused for both columns so the two Series always cover the same rows.
below_thresholds = (real_estate["Sale Amount"] < 4000000000) & (real_estate["Assessed Value"] < 800000000)
assessed = real_estate.loc[below_thresholds, "Assessed Value"]
sale = real_estate.loc[below_thresholds, "Sale Amount"]

plt.figure(figsize=(11, 5))

# Left panel (position 1 of a 1-row, 2-column grid): every row.
plt.subplot(1, 2, 1)
plt.scatter(real_estate["Assessed Value"], real_estate["Sale Amount"])
plt.xlabel("Assessed Value")
plt.ylabel("Sale Amount")
plt.title("Sale Amount vs Assessed Value")

# Right panel (position 2): the same plot with the outliers removed.
plt.subplot(1, 2, 2)
plt.scatter(assessed, sale)
plt.xlabel("Assessed Value")
plt.ylabel("Sale Amount")
plt.title("Sale Amount vs Assessed Value")

In [148]:
plt.figure(figsize=(10,5))
# Draw one scatter series per property type so each type gets its own colour and legend entry.
# dropna() skips a missing property type: comparing a column with NaN matches no rows, so
# that iteration would only add an empty entry to the legend.
# The loop variable is called property_type rather than property so that Python's built-in
# property is not overwritten for the rest of the session.
for property_type in real_estate['Property Type'].dropna().unique():
    # Build the row mask once and use it for both axes. The mask covers every row of
    # real_estate; pandas lines it up with the filtered assessed/sale Series by index label.
    is_this_type = real_estate['Property Type'] == property_type
    plt.scatter(assessed[is_this_type], sale[is_this_type], label=property_type)
plt.xlabel('Assessed Value')
plt.ylabel('Sale Amount')
plt.title('Sale Amount vs Assessed Value')
plt.legend()

In [None]:
# Histogram of the sales ratio, restricted to ratios below 0.5 and split into 100 bins.
sales_ratio = real_estate["Sales Ratio"]
ratios_below_half = sales_ratio[sales_ratio < 0.5]

plt.figure(figsize=(5, 5))
plt.hist(ratios_below_half, bins=100)
plt.xlabel("Sales Ratio")
plt.ylabel("Number of Properties")
plt.title("Sales Ratio Distribution")