# Pandas Tutorial

This notebook is a `pandas` tutorial that I am working through to learn how to analyze data properly in Python, so that I can then move on to machine learning projects.

In [1]:
import pandas as pd

In [2]:
# A Series is a one-dimensional labeled array: every value is paired with an index label.
series = pd.Series(
    data=[0.4, 0.3, 0.2, 0.1],   # values
    index=['A', 'B', 'C', 'D'],  # index labels, one per value
)
series

A    0.4
B    0.3
C    0.2
D    0.1
dtype: float64

In [3]:
series['A'] # Look up a value by its index label (the lookup goes label -> value, not the other way around)

0.4

In [4]:
# Pandas DataFrame

# The raw data starts out as a plain dictionary:
# each key becomes a column name and each list holds that column's values.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 24, 35, 32],
}

# pd.DataFrame() is the constructor that turns the dictionary into a DataFrame.
df = pd.DataFrame(data)

# .head() displays the first 5 rows by default; pass a number to change that, e.g. df.head(10).
df.head()

Unnamed: 0,Name,Age
0,John,28
1,Anna,24
2,Peter,35
3,Linda,32


In [8]:
print(df.describe()) # Prints summary statistics (count, mean, std, min, quartiles, max) for the numeric columns; it does not list rows.
print(df.tail()) # Prints the last 5 rows by default; pass a number to change the amount, e.g. df.tail(10).

             Age
count   4.000000
mean   29.750000
std     4.787136
min    24.000000
25%    27.000000
50%    30.000000
75%    32.750000
max    35.000000
    Name  Age
0   John   28
1   Anna   24
2  Peter   35
3  Linda   32


In [6]:
# Reference notes only: this cell does not read any file.
# NOTE(review): 'file.csv' / 'file.xlsx' look like placeholder file names -- confirm.
# NOTE(review): a bare triple-quoted string is an expression, not a comment, so the
# notebook echoes its repr as this cell's output; a markdown cell would avoid that.
'''
Essential operations for reading data are as follows

df = pd.read_csv('file.csv')

df = pd.read_excel('file.xlsx')
'''

"\nEssential operations for reading data are as follows\n\ndf = pd.read_csv('file.csv')\n\ndf = pd.read_excel('file.xlsx')\n"

In [10]:
# Selection and Indexing

# Selecting a single column with one label returns a Series.

print(df['Name'])

# Selecting multiple columns with a list of labels (note the double brackets) returns a DataFrame.

print(df[['Name', 'Age']])

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object
    Name  Age
0   John   28
1   Anna   24
2  Peter   35
3  Linda   32


In [11]:
print(df.iloc[0])

# The .iloc indexer selects a row by its integer position; position 0 is the first row,
# and that single row is returned as a Series.

Name    John
Age       28
Name: 0, dtype: object


In [None]:
df.loc[0]

# The .loc indexer selects a row by its index label. This df uses the default
# labels 0..3, so the label 0 happens to also be the first row.

In [13]:
# Filtering

filtered_df = df[df['Age'] > 30]
filtered_df.head()

# df['Age'] > 30 builds a boolean mask (one True/False per row); indexing df with that
# mask keeps only the rows where Age is greater than 30.

Unnamed: 0,Name,Age
2,Peter,35
3,Linda,32


In [14]:
# Assigning new columns

df['Age * 2'] = df['Age'] *2

df

# Creating a new column in a df is just like assigning a value to a variable:
# assign to a column label that does not exist yet. The multiplication is applied to every row.

Unnamed: 0,Name,Age,Age * 2
0,John,28,56
1,Anna,24,48
2,Peter,35,70
3,Linda,32,64


In [15]:
# Data Cleaning and Prep

df.isnull().sum()

# .isnull() marks each cell True where the value is missing; .sum() then counts
# those True values per column, giving the number of missing values in each column.

Name       0
Age        0
Age * 2    0
dtype: int64

In [22]:
df = df.dropna()

# The .dropna() method returns a new DataFrame without the rows that contain missing values.
# The result is assigned back to df instead of passing inplace=True, which keeps the
# update explicit and lets the call be chained with other methods.

In [21]:
df = df.fillna(value=0)

# .fillna(value=0) returns a new DataFrame with every missing value replaced by 0.
# The result is assigned back to df instead of passing inplace=True.

In [19]:
df = df.rename(columns={'OldName': 'NewName'})

# The .rename() method returns a new DataFrame with columns renamed according to the
# {old_name: new_name} mapping; the result is assigned back to df instead of using inplace=True.
# 'OldName' / 'NewName' are placeholders: labels that are not present in df are ignored
# by default, so this particular call leaves the current columns unchanged.

In [20]:
df.head() # Shows the first rows of df (5 by default; this df has only 4 rows, so all of them are displayed)

Unnamed: 0,Name,Age,Age * 2
0,John,28,56
1,Anna,24,48
2,Peter,35,70
3,Linda,32,64


In [23]:
# Cast the Age column to the integer dtype and store the converted column back in df.
df['Age'] = df['Age'].astype(int)