# Pandas

## Reading data

In [None]:
import pandas as pd

# Each call below loads one file format; `df` is rebound every time,
# so after this cell it holds the result of the last (JSON) read.

# Excel workbook
df = pd.read_excel(io='file_path.xlsx')

# Comma-separated values
df = pd.read_csv(filepath_or_buffer='file_path.csv')

# JSON document
df = pd.read_json(path_or_buf='file_path.json')

In [None]:
import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): the connection string looks like a placeholder -- replace it
# with a real database URL before running this cell.
engine = create_engine('database_connection_string')
query = "SELECT * FROM table_name"

# Run the query through the engine and load the result into `df`.
df = pd.read_sql(sql=query, con=engine)

## Exploring data

In [2]:
import pandas as pd

# One list per column; all three lists have one entry per car.
car_names = ['Toyota Corolla', 'Honda Civic', 'Ford Mustang', 'Chevrolet Camaro', 'Tesla Model 3', 'BMW X5', 'Audi A4']
car_prices = [25000, 22000, 35000, 40000, 50000, 60000, 30000]
car_is_broken = [False, True, True, False, True, False, False]

# Dict keys become the column labels of the DataFrame.
data = {
    'Car Name': car_names,
    'Price': car_prices,
    'Is Broken': car_is_broken,
}

df_cars = pd.DataFrame(data=data)
df_cars

Unnamed: 0,Car Name,Price,Is Broken
0,Toyota Corolla,25000,False
1,Honda Civic,22000,True
2,Ford Mustang,35000,True
3,Chevrolet Camaro,40000,False
4,Tesla Model 3,50000,True
5,BMW X5,60000,False
6,Audi A4,30000,False


In [3]:
df_cars.head(n=3) # display the first 3 rows

Unnamed: 0,Car Name,Price,Is Broken
0,Toyota Corolla,25000,False
1,Honda Civic,22000,True
2,Ford Mustang,35000,True


In [4]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df_cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Car Name   7 non-null      object
 1   Price      7 non-null      int64 
 2   Is Broken  7 non-null      bool  
dtypes: bool(1), int64(1), object(1)
memory usage: 247.0+ bytes


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max); here only the numeric 'Price' column is reported.
df_cars.describe()

Unnamed: 0,Price
count,7.0
mean,37428.571429
std,13709.572603
min,22000.0
25%,27500.0
50%,35000.0
75%,45000.0
max,60000.0


In [None]:
# CSV read settings: each call demonstrates one `read_csv` option.
# Every call rebinds `df_cars`, so only the result of the last read is kept.
# All calls use the same relative path so the cell does not depend on a
# Colab-specific absolute location.
df_cars = pd.read_csv('./cars.csv', sep=',')                 # explicit field separator (',' is the default)
df_cars = pd.read_csv('./cars.csv', delimiter=';')           # `delimiter` is an alias for `sep`
df_cars = pd.read_csv('./cars.csv', header=3)                # take the column names from row 3 (0-indexed)
df_cars = pd.read_csv('./cars.csv', index_col='Car Name')    # use the 'Car Name' column as the row index

## Series

A `Series` is the basic one-dimensional data structure in pandas: a sequence of values in which every value is paired with an index label.

In [6]:
# Labels and values are kept in two parallel lists.
names_list = ['Anna', 'Bob', 'Maria', 'Jack']
ages_list = [21, 20, 25, 22]

# The names become the index labels; the Series itself is named 'Age'.
ages_series = pd.Series(data=ages_list, index=names_list, name='Age')
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     22
Name: Age, dtype: int64


In [10]:
# A dict already pairs each label with its value, so no separate index is passed.
student_ages_dict = {
    'Anna': 21,
    'Bob': 20,
    'Maria': 25,
    'Jack': 22,
}

# Dict keys become the index labels; the Series itself is named 'Ages'.
ages_series = pd.Series(data=student_ages_dict, name='Ages')
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     22
Name: Ages, dtype: int64


In [8]:
# Relabel a copy instead of overwriting `ages_series.index`: the following
# cells still address entries by name ('Jack', 'Maria'), and those labels
# would no longer exist if the original Series were relabelled in place.
ages_by_initial = ages_series.copy()
ages_by_initial.index = ['A', 'B', 'M', 'J']
print(ages_by_initial)

A    21
B    20
M    25
J    22
Name: Ages, dtype: int64


### Modifying a series object

In [11]:
# Assign by label: the value stored under 'Jack' becomes 23.
ages_series.loc['Jack'] = 23
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     23
Name: Ages, dtype: int64


In [12]:
# `drop` returns a new Series without the 'Maria' entry;
# the result is stored separately in `new_ages_series`.
new_ages_series = ages_series.drop(labels='Maria')
print(new_ages_series)

Anna    21
Bob     20
Jack    23
Name: Ages, dtype: int64


In [13]:
# The earlier `drop` result went to `new_ages_series`, so 'Maria' is still
# present in `ages_series`; this assignment sets her value to 25 again.
ages_series.loc['Maria'] = 25
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     23
Name: Ages, dtype: int64


In [14]:
# Give the new records the same name as `ages_series`: `pd.concat` keeps the
# Series name only when all inputs share it, otherwise the result is unnamed.
new_recs = pd.Series({'Jon': 34, 'Peter': 23, 'Karo': 45, 'Abby': 25}, name=ages_series.name)

# NOTE: this rebinds `ages_series`, so running the cell a second time
# appends the same four records again.
ages_series = pd.concat([ages_series, new_recs])
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     23
Jon      34
Peter    23
Karo     45
Abby     25
dtype: int64


## Dataframes

In [15]:
# One inner list per student: first name, family name, age.
students_list = [
    ['Anna', 'Smith', 21],
    ['Bob', 'Jones', 20],
    ['Maria', 'Williams', 25],
    ['Jack', 'Brown', 22],
]

# Column labels are passed separately; no index is given, so rows are numbered from 0.
student_columns = ['First Name', 'Family Name', 'Age']
students = pd.DataFrame(data=students_list, columns=student_columns)
students

Unnamed: 0,First Name,Family Name,Age
0,Anna,Smith,21
1,Bob,Jones,20
2,Maria,Williams,25
3,Jack,Brown,22


In [16]:
# Rebuild the table from `students_list`, this time labelling the rows
# with explicit student numbers.
students_number = [100, 200, 300, 400]
students = pd.DataFrame(
    data=students_list,
    index=students_number,
    columns=['First Name', 'Family Name', 'Age'],
)
students

Unnamed: 0,First Name,Family Name,Age
100,Anna,Smith,21
200,Bob,Jones,20
300,Maria,Williams,25
400,Jack,Brown,22


In [17]:
# Nested dictionary for the students table: the outer keys are the column
# names, the inner keys are the row labels (student numbers).
students_dict = {
    'First Name': {100: 'Anna', 200: 'Bob', 300: 'Maria', 400: 'Jack'},
    'Family Name': {100: 'Smith', 200: 'Jones', 300: 'Williams', 400: 'Brown'},
    'Age': {100: 21, 200: 20, 300: 25, 400: 22},
}

students = pd.DataFrame(data=students_dict)
students

Unnamed: 0,First Name,Family Name,Age
100,Anna,Smith,21
200,Bob,Jones,20
300,Maria,Williams,25
400,Jack,Brown,22


In [18]:
# Dimensions of the frame as a (rows, columns) tuple.
students.shape

(4, 3)

In [19]:
# Show the first 2 rows.
students.head(n=2)

Unnamed: 0,First Name,Family Name,Age
100,Anna,Smith,21
200,Bob,Jones,20


In [22]:
# Show the last 2 rows.
students.tail(n=2)

Unnamed: 0,First Name,Family Name,Age
300,Maria,Williams,25
400,Jack,Brown,22


In [20]:
# Selecting a single column by its label returns that column as a Series.
students['Age']

100    21
200    20
300    25
400    22
Name: Age, dtype: int64

### Saving a dataframe to file

In [None]:
# Write only the two name columns, separated by tabs, and leave out the row index.
students.to_csv(
    'student_names.csv',
    sep='\t',
    columns=['First Name', 'Family Name'],
    index=False,
)

### Exercises

In [21]:
import pandas as pd


# Nested dict: the outer keys are the column labels, the inner keys are the row labels.
my_dict = {
    'A': {1: 1, 2: 4, 3: 6},
    'B': {1: 2, 2: 7, 3: 10},
    'C': {1: 3, 2: 11, 3: 16},
}

my_df = pd.DataFrame(data=my_dict)

# Show column C of my_df
my_df['C']

1     3
2    11
3    16
Name: C, dtype: int64