# 1. Python with Pandas

## (i) Loading and saving data with pandas

In [1]:
import pandas as pd
import numpy as np

The `Workbook` module from `openpyxl` - pandas uses openpyxl as the engine when saving to an Excel (.xlsx) sheet

In [2]:
from openpyxl.workbook import Workbook

In [3]:
# Read the Excel workbook into a DataFrame.
df_excel = pd.read_excel(io='regions.xlsx')

In [4]:
# By default the first row of the file would be used as the header for the
# rest of the data; header=None tells pandas the CSV has no header row.
df_csv = pd.read_csv(
    'Names.csv',
    header=None,
)

In [5]:
# Split fields on tab characters instead of the default comma.
df_txt = pd.read_csv('data.txt', sep='\t')

In [6]:
# Optional: uncomment any of the lines below to inspect the frames loaded above.
# print(df_excel)
# print(df_txt)
# print(df_csv)

In [7]:
# The CSV was read without a header row, so label its columns by hand.
df_csv.columns = [
    'First', 'Last', 'Address', 'City',
    'State', 'Area Code', 'Income',
]

In [8]:
# Save the labelled frame to a new Excel workbook.
df_csv.to_excel(excel_writer='modified.xlsx')

## (ii) Viewing and inspecting data with pandas

In [9]:
# Load some data into a DataFrame so there is something to inspect.
df = pd.read_csv(
    'Names.csv',
    header=None,
)

# The file has no header row, so name the columns explicitly.
df.columns = [
    'First', 'Last', 'Address', 'City',
    'State', 'Area Code', 'Other',
]
print(df.columns)

Index(['First', 'Last', 'Address', 'City', 'State', 'Area Code', 'Other'], dtype='object')


#### Accessing single column


In [10]:
# Single brackets with one column label select that column on its own.
print(df['Last'])

0         Doe
1    McGinnis
2      Repici
3       Tyler
4    Blankman
5         Jet
Name: Last, dtype: object


#### Accessing multiple columns


In [11]:
# Double brackets [[ ]]: the outer pair indexes the DataFrame, the inner pair
# is a Python list holding the column labels to keep.

print(df[['State', 'Area Code']])

  State  Area Code
0    NJ       8074
1    PA       9119
2    NJ       8075
3    SD      91234
4    SD        298
5    CO        123


#### Slicing 
Select the `First` column and keep only its first three rows.

In [12]:
# Positional slice: rows 0, 1 and 2 of the 'First' column (the stop value 3 is excluded).
print(df['First'].iloc[0:3])

0             John
1             Jack
2    John "Da Man"
Name: First, dtype: object


### pandas.DataFrame.iloc
- integer-location based indexing for selection by position

In [13]:
# Print the entire DataFrame first; the following cells use iloc to pick
# specific rows and elements out of it.
print(df)

                 First      Last                           Address  \
0                 John       Doe                 120 jefferson st.   
1                 Jack  McGinnis                      220 hobo Av.   
2        John "Da Man"    Repici                 120 Jefferson St.   
3              Stephen     Tyler  7452 Terrace "At the Plaza" road   
4                  NaN  Blankman                               NaN   
5  Joan "Danger", Anne       Jet               9th, at Terrace plc   

          City State  Area Code   Other  
0    Riverside    NJ       8074   45000  
1        Phila    PA       9119   18000  
2    Riverside    NJ       8075  120000  
3     SomeTown    SD      91234   90000  
4     SomeTown    SD        298   30000  
5  Desert City    CO        123   68000  


In [14]:
# Choose the row at integer position 1 (the second row, since positions start at 0).

print(df.iloc[1])

First                Jack
Last             McGinnis
Address      220 hobo Av.
City                Phila
State                  PA
Area Code            9119
Other               18000
Name: 1, dtype: object


In [15]:
# Choose one single element, "Repici", located at row position 2, column position 1.

print(df.iloc[2,1])

Repici


##### Checking certain rows/columns or specific location of an Excel file consistently

- Extract the First, Last and State columns from the data frame and save them to an Excel document with `to_excel`

In [16]:
# Keep only the three columns of interest.
wanted_value = df[['First', 'Last', 'State']]

# to_excel writes the file and returns None, so there is nothing worth binding
# to a name. index=False leaves the row index out of the sheet.
wanted_value.to_excel('State_Location.xlsx', index=False)
# check folder

## (iii) Filter and Sort with Pandas

### pandas.DataFrame.loc
- Access a group of rows and columns by label(s) or a boolean array

In [17]:
# Riverside residents: boolean mask over the rows, every column kept.

print(df.loc[df.City == 'Riverside', :])

           First    Last            Address       City State  Area Code  \
0           John     Doe  120 jefferson st.  Riverside    NJ       8074   
2  John "Da Man"  Repici  120 Jefferson St.  Riverside    NJ       8075   

    Other  
0   45000  
2  120000  


In [18]:
# Riverside residents whose first name is John.
# - pandas combines boolean masks with & rather than the `and` keyword
# - John "Da Man" is not returned: his first-name field includes the quoted
#   nickname, so it is not equal to 'John'
# - clean data is a must

print(
    df.loc[
        (df.City == 'Riverside')
        & (df.First == 'John')
    ]
)

  First Last            Address       City State  Area Code  Other
0  John  Doe  120 jefferson st.  Riverside    NJ       8074  45000


##### lambda function

A lambda is an anonymous function, i.e. a function that is defined without a name.
- normal functions are defined using the `def` keyword
- anonymous functions are defined using the `lambda` keyword

syntax 
- lambda arguments: expression

In [19]:
# Lambda demo: a one-expression anonymous function, bound to a name so it can be called.
double = (lambda x: x * 2)

# Calling it works like calling any other function.
print(double(5))

10


### pandas.DataFrame.apply

- apply a function along an axis of the DataFrame


In [20]:
# Tax percentage column, based on the income.

# Load into `df`, the frame that is renamed and extended below, so this cell
# does not depend on whatever an earlier cell left in `df`.
df = pd.read_csv('Names.csv', header=None)
df.columns = ['First', 'Last', 'Address', 'City', 'State', 'Area Code', 'Income']

# Tax % is derived from income: apply() calls the lambda once per element x of
# the Income column, and the expression after the colon is a chained conditional.
# The brackets share the 40000 boundary so every income lands in exactly one:
#   10000 <  x < 40000  -> 0.15
#   40000 <= x < 80000  -> 0.20
#   anything else       -> 0.25
# NOTE(review): incomes of 10000 or less also fall into the 0.25 branch - confirm this is intended.
df['Tax %'] = df['Income'].apply(
    lambda x: .15 if 10000 < x < 40000 else .2 if 40000 <= x < 80000 else .25
)
print(df)

                 First      Last                           Address  \
0                 John       Doe                 120 jefferson st.   
1                 Jack  McGinnis                      220 hobo Av.   
2        John "Da Man"    Repici                 120 Jefferson St.   
3              Stephen     Tyler  7452 Terrace "At the Plaza" road   
4                  NaN  Blankman                               NaN   
5  Joan "Danger", Anne       Jet               9th, at Terrace plc   

          City State  Area Code  Income  Tax %  
0    Riverside    NJ       8074   45000   0.20  
1        Phila    PA       9119   18000   0.15  
2    Riverside    NJ       8075  120000   0.25  
3     SomeTown    SD      91234   90000   0.25  
4     SomeTown    SD        298   30000   0.15  
5  Desert City    CO        123   68000   0.20  


In [21]:
# Element-wise product of the tax rate and the income gives the amount owed per row.
df['Taxes Owed'] = df['Tax %'] * df['Income']
print(df['Taxes Owed'])

0     9000.0
1     2700.0
2    30000.0
3    22500.0
4     4500.0
5    13600.0
Name: Taxes Owed, dtype: float64


### pandas.DataFrame.drop
- Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names



In [22]:
# Drop a few columns so the printed frame fits on one line.
to_drop = [
    'Area Code',
    'First',
    'Address',
]

# drop() returns a new frame without the listed columns; rebind df to it.
df = df.drop(columns=to_drop)
print(df)

       Last         City State  Income  Tax %  Taxes Owed
0       Doe    Riverside    NJ   45000   0.20      9000.0
1  McGinnis        Phila    PA   18000   0.15      2700.0
2    Repici    Riverside    NJ  120000   0.25     30000.0
3     Tyler     SomeTown    SD   90000   0.25     22500.0
4  Blankman     SomeTown    SD   30000   0.15      4500.0
5       Jet  Desert City    CO   68000   0.20     13600.0


### pandas.DataFrame.loc

- change values in one column based on a condition on another column
- if the income is below 60,000, change the `False` values in `Test Col` to `True`
- on an extremely large data set, the same pattern can find rows matching any condition and change them based on other values

In [23]:
# Start with every row flagged False ...
df['Test Col'] = False

# ... then flip the flag to True on the rows whose income is below 60000.
df.loc[df.Income < 60000, 'Test Col'] = True

print(df)

       Last         City State  Income  Tax %  Taxes Owed  Test Col
0       Doe    Riverside    NJ   45000   0.20      9000.0      True
1  McGinnis        Phila    PA   18000   0.15      2700.0      True
2    Repici    Riverside    NJ  120000   0.25     30000.0     False
3     Tyler     SomeTown    SD   90000   0.25     22500.0     False
4  Blankman     SomeTown    SD   30000   0.15      4500.0      True
5       Jet  Desert City    CO   68000   0.20     13600.0     False


### pandas.DataFrame.groupby
- some combination of splitting the object, applying a function, and combining the results. 
- can be used to group large amounts of data and compute operations on these groups

In [24]:
# Group the rows by the two unique values in 'Test Col' (False and True) and
# take the average of each remaining column within each group.
#
# numeric_only=True restricts the mean to the numeric columns. The text columns
# (Last, City, State) cannot be averaged, and pandas >= 2.0 raises a TypeError
# for them instead of silently dropping them.

print(df.groupby(['Test Col']).mean(numeric_only=True))

                Income     Tax %    Taxes Owed
Test Col                                      
False     92666.666667  0.233333  22033.333333
True      31000.000000  0.166667   5400.000000


### pandas.DataFrame.sort_values
- Sort by the values along either axis

In [25]:
# Same per-group averages, ordered by ascending mean income.
# numeric_only=True keeps the text columns out of the mean (required on pandas >= 2.0).
print(df.groupby(['Test Col']).mean(numeric_only=True).sort_values('Income'))

                Income     Tax %    Taxes Owed
Test Col                                      
True      31000.000000  0.166667   5400.000000
False     92666.666667  0.233333  22033.333333


## (iv) Cleaning data with pandas



In [26]:
# Start the cleaning section from a fresh copy of the raw file.
df = pd.read_csv(
    'Names.csv',
    header=None,
)
df.columns = [
    'First', 'Last', 'Address', 'City',
    'State', 'Area Code', 'Income',
]

### pandas.DataFrame.set_index
- Set the DataFrame index using existing columns
- indexing by area code: using an identifier that is unique for each row makes lookups easy

In [27]:
# Remove the Address column, then use the area code as the row index.
df = (
    df
    .drop(columns='Address')
    .set_index('Area Code')
)

# With the area code as the index, loc looks a row up by its area code.
print(df.loc[8074])

First          John
Last            Doe
City      Riverside
State            NJ
Income        45000
Name: 8074, dtype: object


In [28]:
# iloc still selects by integer position, regardless of the index labels:
# position 0 is the same row that loc returned for the label 8074.

print(df.iloc[0])

First          John
Last            Doe
City      Riverside
State            NJ
Income        45000
Name: 8074, dtype: object


#### slicing method


In [29]:
# Label-based slice: from index label 8074 through the last row, 'First' column only.

print(df.loc[8074:, 'First'])

Area Code
8074                    John
9119                    Jack
8075           John "Da Man"
91234                Stephen
298                      NaN
123      Joan "Danger", Anne
Name: First, dtype: object


In [30]:
# Treat each entry of 'First' as a string and split it on whitespace;
# expand=True puts every resulting word into its own column.

print(df['First'].str.split(expand=True))

                 0          1     2
Area Code                          
8074          John       None  None
9119          Jack       None  None
8075          John        "Da  Man"
91234      Stephen       None  None
298            NaN        NaN   NaN
123           Joan  "Danger",  Anne


In [31]:
# Keep only the first word of each entry as the first name.
# split(expand=True) returns a DataFrame with one column per word, so column 0
# is selected explicitly: a multi-column DataFrame cannot be assigned to the
# single 'First' column.

df['First'] = df['First'].str.split(expand=True)[0]
print(df)

             First      Last         City State  Income
Area Code                                              
8074          John       Doe    Riverside    NJ   45000
9119          Jack  McGinnis        Phila    PA   18000
8075          John    Repici    Riverside    NJ  120000
91234      Stephen     Tyler     SomeTown    SD   90000
298            NaN  Blankman     SomeTown    SD   30000
123           Joan       Jet  Desert City    CO   68000


###  pandas.DataFrame.replace
- Replace values given in to_replace with value
- Values of the DataFrame are replaced with other values dynamically. 
- This differs from updating with .loc or .iloc, which require you to specify a location to update with some value

A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern

In [32]:
# Locate the missing values (numpy's nan) and replace them with a string that
# is easy to identify. np.nan is the supported spelling (the np.NaN alias was
# removed in NumPy 2.0), and no regex is involved when matching a NaN value.

df = df.replace(np.nan, 'N/A')
print(df)

             First      Last         City State  Income
Area Code                                              
8074          John       Doe    Riverside    NJ   45000
9119          Jack  McGinnis        Phila    PA   18000
8075          John    Repici    Riverside    NJ  120000
91234      Stephen     Tyler     SomeTown    SD   90000
298            N/A  Blankman     SomeTown    SD   30000
123           Joan       Jet  Desert City    CO   68000


In [33]:
# Save the cleaned frame to Excel. to_excel writes the file and returns None,
# so its result is not bound to a name.
df.to_excel('modified1.xlsx')