# Pandas Tutorial

This is a `pandas` tutorial that I am working through so that I can properly analyze data in Python and then move on to machine learning projects.

In [29]:
import pandas as pd

In [30]:
# A Series is a one-dimensional labelled array: every value is paired with an index label.
values = [0.4, 0.3, 0.2, 0.1]
labels = ['A', 'B', 'C', 'D']
series = pd.Series(data=values, index=labels)  # index= supplies the labels (indices)
series

A    0.4
B    0.3
C    0.2
D    0.1
dtype: float64

In [31]:
series['A'] # Use an index label to look up the value stored under it

0.4

In [32]:
# Pandas DataFrame

# The data starts out as a plain dictionary: each key becomes a column name
# and each list holds the values of that column.
names = ['John', 'Anna', 'Peter', 'Linda']
ages = [28, 24, 35, 32]
data = {'Name': names, 'Age': ages}

# The pd.DataFrame constructor turns the dictionary into a DataFrame (df).
df = pd.DataFrame(data=data)

# head() displays the first 5 rows by default; pass a number to change the amount.
df.head()

Unnamed: 0,Name,Age
0,John,28
1,Anna,24
2,Peter,35
3,Linda,32


In [33]:
print(df.describe()) # Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(df.tail()) # Displays the last 5 rows by default; pass a number to change the amount.

             Age
count   4.000000
mean   29.750000
std     4.787136
min    24.000000
25%    27.000000
50%    30.000000
75%    32.750000
max    35.000000
    Name  Age
0   John   28
1   Anna   24
2  Peter   35
3  Linda   32


In [34]:
# NOTE: the block below is a bare string literal, not a comment, so the notebook
# echoes it back as the cell's output. The file names appear to be placeholders.
'''
Essential operations for reading data are as follows

df = pd.read_csv('file.csv')

df = pd.read_excel('file.xlsx')
'''

"\nEssential operations for reading data are as follows\n\ndf = pd.read_csv('file.csv')\n\ndf = pd.read_excel('file.xlsx')\n"

In [35]:
# Selection and Indexing

# Selecting a single column: pass the column name; the result is a Series.

print(df['Name'])

# Selecting multiple columns: pass a list of column names; the result is a DataFrame.

print(df[['Name', 'Age']])

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object
    Name  Age
0   John   28
1   Anna   24
2  Peter   35
3  Linda   32


In [36]:
print(df.iloc[0])

# The .iloc indexer selects a row of data by its integer position (0 is the first row).

Name    John
Age       28
Name: 0, dtype: object


In [37]:
df.loc[0]

# df.loc[0] selects the row whose index label is 0.
# With the default 0, 1, 2, ... index used here, that is also the first row.

Name    John
Age       28
Name: 0, dtype: object

In [38]:
# Filtering

# Step 1: build a boolean mask that is True on the rows where 'Age' is greater than 30.
is_over_30 = df['Age'].gt(30)

# Step 2: keep only the rows of df for which the mask is True.
filtered_df = df.loc[is_over_30]
filtered_df.head()

Unnamed: 0,Name,Age
2,Peter,35
3,Linda,32


In [39]:
# Assigning new columns

df['Age * 2'] = df['Age'] *2

df

# Creating a new column in a df works just like assigning a value to a variable:
# the new column's name goes on the left and its values go on the right.

Unnamed: 0,Name,Age,Age * 2
0,John,28,56
1,Anna,24,48
2,Peter,35,70
3,Linda,32,64


In [40]:
# Data Cleaning and Prep

df.isnull().sum()

# this returns the number of missing values in each column

Name       0
Age        0
Age * 2    0
dtype: int64

In [41]:
# The .dropna() method returns a copy of the df without the rows that contain missing values.
# The result is assigned back to df instead of using inplace=True, which keeps the
# change visible in the code and lets the call be chained with other methods.
# This df has no missing values, so no rows are actually removed here.
df = df.dropna()

In [42]:
# .fillna(value=0) returns a copy of the df in which every missing value is replaced with 0.
# The result is assigned back to df instead of using inplace=True, so the change is explicit.
df = df.fillna(value=0)

In [43]:
# The .rename() method returns a copy of the df with columns renamed according to
# the {old name: new name} mapping; the result is assigned back to df instead of
# using inplace=True.
# 'OldName' is only a placeholder: this df has no column with that name, and by
# default rename() ignores names it cannot find, so the columns stay as they are.
df = df.rename(columns={'OldName': 'NewName'})

In [26]:
# Show the first rows of df (head() returns 5 rows by default).
df.head()

Unnamed: 0,Name,Age,Age * 2
0,John,28,56
1,Anna,24,48
2,Peter,35,70
3,Linda,32,64


In [44]:
df['Age'] = df['Age'].astype(int)

# This converts the 'Age' column of the df to the int data type.

In [53]:
# Grouping Data

# Sample data: three rows with Age 20 and two rows with Age 21.
data = {
    'Age': [20, 21, 20, 21, 20],
    'Score': [85, 90, 78, 88, 92],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
}

# Turn the dictionary into a pandas DataFrame.
example_df = pd.DataFrame(data=data)

# Keep only the numeric columns ('Age' and 'Score'); the text column 'Name' cannot be averaged.
numeric_df = example_df.select_dtypes(include='number')

# Group the rows by 'Age' and compute the mean of the remaining column(s) within each group.
grouped_df = numeric_df.groupby(by='Age').agg('mean')

grouped_df

Unnamed: 0_level_0,Score
Age,Unnamed: 1_level_1
20,85.0
21,89.0


In [52]:
# Aggregating Data

data = {
    'Age': [20, 21, 20, 21, 20],
    'Score': [85, 90, 78, 88, 92],
    'Height': [160, 170, 158, 175, 162],
}

new_df = pd.DataFrame(data=data)

# Map each column to the aggregation applied to it:
# 'Score' is summed and 'Height' is averaged.
column_aggregations = {'Score': 'sum', 'Height': 'mean'}

# Group the rows by 'Age', then apply the per-column aggregations within each group.
aggregated_df = new_df.groupby(by='Age').agg(column_aggregations)

aggregated_df

Unnamed: 0_level_0,Score,Height
Age,Unnamed: 1_level_1,Unnamed: 2_level_1
20,255,160.0
21,178,172.5


In [54]:
# Merging DataFrames

# Two small frames that share the column names 'key' and 'value'.
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C'],
    'value': [1, 2, 3],
})
df2 = pd.DataFrame({
    'key': ['D', 'E', 'F'],
    'value': [4, 5, 6],
})

# merge() performs an inner join by default, keeping only the keys found in both frames.
# Here no key appears in both df1 and df2, so the merged frame has no rows.
# The clashing 'value' columns receive the default suffixes '_x' and '_y'.
merged_df = df1.merge(df2, on='key')

merged_df

Unnamed: 0,key,value_x,value_y


In [55]:
# Joining

# join() lines rows up by index, so 'key' is moved from a column into the index first.
# set_index() returns a new frame; assigning it back avoids the hidden in-place
# mutation that inplace=True performs.
df1 = df1.set_index('key')
df2 = df2.set_index('key')

# join() is a left join by default: every row of df1 is kept.
# None of df1's keys (A, B, C) exist in df2 (D, E, F), so 'value_right' is all NaN.
# lsuffix / rsuffix tell the two clashing 'value' columns apart.
joined_df = df1.join(df2, lsuffix='_left', rsuffix='_right')
joined_df

Unnamed: 0_level_0,value_left,value_right
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1,
B,2,
C,3,


In [58]:
# Time Series

data = {'Date': ['2021-01-01', '2021-02-01', '2021-03-01'],
        'Value': [10,20,30]}
time_df = pd.DataFrame(data)

time_df['Date'] = pd.to_datetime(time_df['Date']) # Converts "Date" column to datetime.
time_df.set_index('Date', inplace=True) # Sets 'Date' as the index.
monthly_mean = time_df.resample('ME').mean() # Resample the DataFrame by month and computes the Mean

monthly_mean

Unnamed: 0_level_0,Value
Date,Unnamed: 1_level_1
2021-01-31,10.0
2021-02-28,20.0
2021-03-31,30.0
