 # Creating Dataframes

In [None]:
# Before we analyze anything, we need to import pandas
import pandas as pd

# Reading data from a csv file:
df = pd.read_csv('students.csv')

# Writing data to a csv file:
# to_csv() is a method of the DataFrame itself (there is no top-level pd.to_csv).
# It returns None when given a file path, so do not assign the result back to df.
df.to_csv('students_new.csv')

# Example of the contents of a .csv file (plain text, not Python code); note there are no spaces before or after the commas
name,cake_flavor,frosting_flavor,topping
Chocolate Cake,chocolate,chocolate,chocolate shavings
Birthday Cake,vanilla,vanilla,rainbow sprinkles
Carrot cake,carrot,cream cheese,almonds

In [None]:
# Ways of creating a Pandas DataFrame

# When building one from scratch, give every column the same number of values
# (e.g. a dictionary whose lists have different lengths raises an error instead of creating a DataFrame).

# From a dictionary: each key is a column name, each list holds that column's values
data = {
    'name': ['Anthony', 'Maria'],
    'age': [30, 28],
}
df = pd.DataFrame(data=data)

# From a dictionary, written inline
df1 = pd.DataFrame(
    data={
        'name': ['John Smith', 'Jane Doe', 'Joe Schmo'],
        'address': ['123 Main St.', '456 Maple Ave.', '789 Broadway'],
        'age': [34, 28, 51],
    },
)

# From a list of lists: each inner list is one row, column names are given separately
data = [
    ['Tom', 20],
    ['Jack', 30],
    ['Meera', 25],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])

# From a list of lists, written inline
df2 = pd.DataFrame(
    data=[
        ['John Smith', '123 Main St.', 34],
        ['Jane Doe', '456 Maple Ave.', 28],
        ['Joe Schmo', '789 Broadway', 51],
    ],
    columns=['name', 'address', 'age'],
)
 

# Previewing Data

In [None]:
# Previewing data

# Gives the top 5 rows
df.head()

# Or choose the number of rows
df.head(10)

# Prints a concise summary of the DataFrame: column names, non-null counts and dtypes
# (use df.describe() if you want per-column statistics such as mean, min and max)
df.info()

# Selecting Columns

In [1]:
# Selecting a single column returns a Series: a one-dimensional object, similar to a NumPy array.
# A Series has a single axis, and the labels along that axis are called the index of the series.
# A series is essentially a single column.

# Creating a series
clinic_east = pd.Series([100, 51, 81, 80, 51, 112])

# A dataframe is a two-dimensional object that can hold multiple columns of different types of data. 
# They are similar to a table in SQL.
# A single column of a dataframe is a series, and a dataframe is a container of one or more series objects.

print(type(df['clinic_north']))
# prints: <class 'pandas.core.series.Series'>
print(type(df))
# prints: <class 'pandas.core.frame.DataFrame'>

NameError: name 'pd' is not defined

In [None]:
# Selecting Columns
# There are two possible syntaxes for selecting all values from a column:

df['column_name']
# and
df.column_name
# NOTE: the dot syntax only works when the column name is a valid Python identifier
# (no spaces, etc.) and does not clash with an existing DataFrame attribute or method.

# Selecting rows where age is over 20
df[df.age > 20]
 
# Selecting rows where name is not John
df[df.name != "John"]
 
# Multiple logical conditions can be combined with OR (using |) and AND (using &), 
# and each condition must be enclosed in parentheses.

# Selecting rows where age is less than 10
# OR greater than 70
df[(df.age < 10) | (df.age > 70)]

# Selecting rows where shoe_type is 'sandals' AND shoe_color is 'black'
df[(df.shoe_type == 'sandals') & (df.shoe_color == 'black')]


In [None]:
# Selecting multiple columns
# Use a list of column names, double brackets!!
# (the outer brackets index the DataFrame, the inner brackets are the list of names;
# the result is a new DataFrame, not a Series)

new_df = orders[['last_name', 'email']]


# Selecting Rows

In [None]:
# Selecting Rows

# A row is also a one-dimensional object, also called a Series

# Using indexing: iloc is zero-based, so index 2 is the third row
df.iloc[2]


# Selecting Multiple Rows

# rows at indexes 3, 4, 5 and 6: starts at index 3 (the 4th row) and stops before index 7
df.iloc[3:7]
# rows at indexes 0, 1, 2 and 3: everything up to, but not including, index 4 (i.e. the first four rows)
df.iloc[:4]
# the last three rows: starts at the 3rd to last row and runs up to and including the final row
df.iloc[-3:]


# Can we skip rows? To take only every n-th index, we can include a third value, the step, in the slice.

# This selects values at indexes 0, 3, 6, 9.
# Python list
numbers[0:10:3]

# rows at even indexes only (0, 2, 4, 6), selected by passing a list of positions
orders.iloc[[0, 2, 4, 6]] 

In [None]:
# Select Rows with Logic
# You can select a subset of a DataFrame by using logical statements

# Return only the rows whose value in MyColumnName equals the desired value
df[df.MyColumnName == 'desired_column_value']

# This logical statement compares the values of column `buy_price` and column `sell_price`,
# keeping only the rows where buy_price is lower than sell_price
gains = df[df.buy_price < df.sell_price]


# You can also combine multiple logical statements, as long as each statement is in parentheses.
# When filtering a DataFrame, | means “or” and & means “and”.

df[(df.age < 30) |
   (df.name == 'Martha Jones')]

# We could use the isin method to check that the value in df.column_name is one of a list of values. 
df[df.column_name.isin(['Martha Jones',
     'Rose Tyler',
     'Amy Pond'])]

# Resetting Indices

In [None]:

# When we select a subset of a DataFrame using logic, we end up with non-consecutive indices.
# We can fix this using the method .reset_index(). 
# drop=True means the old indices are NOT retained in a new column called `index`, but dropped instead.
# inplace=True means that the existing DataFrame is modified, a new one is not created.

df.reset_index(drop=True, inplace=True)

# Note it's possible to set the index to a non-integer value, to the values of another column for instance,
# e.g. the country name in a country-demographics DataFrame.

 # Modifying Dataframes

#### New Columns

In [None]:
# New columns
# Pandas DataFrames allow for the addition of columns after the DataFrame has already been created, 
# by using the format df['newColumn'] and setting it equal to the new column’s value.

# Specifying each value in the new column (the list needs one value per row):
df['newColumn'] = [1, 2, 3, 4] 
 
# Setting each row in the new column to the same value:
df['newColumn'] = 1
df['Is taxed?'] = 'Yes'
df['Shipped?'] = True



# Creating a new column by doing a 
# calculation on an existing column:
df['newColumn'] = df['oldColumn'] * 5

# Note both ways to call a column / Pandas Series - dot or [brackets]
# (brackets are required for 'Cost to Manufacture' because the name contains spaces)
df['Margin'] = df.Price - df['Cost to Manufacture']

# We can perform an operation using multiple columns:
df['total'] = (df.price + (df.price * df.tax)) * df.quantity




# Assigning with df['newColumn'] = ... adds the new column to the end of the table.
# To place it somewhere else, use the insert() method with a position.

# Third position would be at index 2, because of zero-indexing.
# The arguments are (position, column name, values); `data` stands for the new column's values.
df.insert(2, 'new-col', data)

#### Modify Columns and Rows

In [None]:
# apply()

# The Pandas apply() function can be used to apply a function on every value in a column or row of a DataFrame, 
# and transform that column or row to the resulting values.
# By default, it will apply a function to all values of a column. 
# To perform it on a row instead, you can specify the argument axis=1 in the apply() function call.

# EG: Create a new column from an existing column, applying lowercase formatting to it.
# (str.lower, not str.upper, so the values match the 'Lowercase Name' column title)
df['Lowercase Name'] = df.Name.apply(str.lower)


# apply() has no inplace=True option like some other methods; it returns a new object, so you must reassign it.
# `my_lambda` below stands for any function you want to apply.
# So this alone does NOT change df...
df.apply(my_lambda)
# MUST reassign to df by doing...
df = df.apply(my_lambda)



# This function doubles the input value
def double(x):
    """Return the input multiplied by two."""
    return 2 * x
 
# Apply this function to double every value in a specified column
# (apply() returns a new Series, so the result is assigned back to the column)
df.column1 = df.column1.apply(double)
 
    
    
# Lambda functions can also be supplied to `apply()`
# A lambda function is a way of defining a function in a single line of code. Usually, we would assign them to a variable.

# For example, the following lambda function multiplies a number by 2 and then adds 3:
mylambda = lambda x: (x * 2) + 3
print(mylambda(5))
# Prints 13

# Here is an example that takes in a string, assigns it to the temporary variable x, and then converts it into lowercase:
stringlambda = lambda x: x.lower()
print(stringlambda("Oh Hi Mark!"))
# Prints "oh hi mark!"
    
# Applying a lambda to a single column: every value in column2 is tripled
df.column2 = df.column2.apply(lambda x: 3 * x)

# Applying to a row requires it to be called on the entire DataFrame;
# axis=1 passes each row to the lambda, whose result becomes that row's value in the new column
df['newColumn'] = df.apply(
    lambda row: row['column1'] * 1.5 + row['column2'],
    axis=1,
)