# Pandas

## Reading data

In [26]:
import pandas as pd

# EXCEL
df = pd.read_excel('file_path.xlsx')

# CSV
df = pd.read_csv('file_path.csv')

# JSON
df = pd.read_json('file_path.json')

FileNotFoundError: [Errno 2] No such file or directory: 'file_path.xlsx'

In [27]:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('database_connection_string')
query = "SELECT * FROM table_name"
df = pd.read_sql(query, engine)

ArgumentError: Could not parse SQLAlchemy URL from string 'database_connection_string'

## Exploring data

In [None]:
import pandas as pd

data = {
    'Car Name': ['Toyota Corolla', 'Honda Civic', 'Ford Mustang', 'Chevrolet Camaro', 'Tesla Model 3', 'BMW X5',
                 'Audi A4'],
    'Price': [25000, 22000, 35000, 40000, 50000, 60000, 30000],
    'Is Broken': [False, True, True, False, True, False, False]
}

df_cars = pd.DataFrame(data)
df_cars

In [None]:
df_cars.head(n=3)  # display the 3 rows

In [None]:
df_cars.info()

In [28]:
df_cars.describe()

NameError: name 'df_cars' is not defined

In [29]:
# CSV read settings
df_cars = pd.read_csv('/content/cars.csv', sep=',')
df_cars = pd.read_csv('./cars.csv', delimiter=';')
df_cars = pd.read_csv('./cars.csv', header=3)
df_cars = pd.read_csv('./cars.csv', index_col='Car Name')

FileNotFoundError: [Errno 2] No such file or directory: '/content/cars.csv'

## Series

A series is a basic one-dimensional data structure in pandas.

In [None]:
ages_list = [21, 20, 25, 22]
names_list = ['Anna', 'Bob', 'Maria', 'Jack']

ages_series = pd.Series(ages_list, index=names_list, name='Age')
print(ages_series)

In [30]:
student_ages_dict = {'Anna': 21, 'Bob': 20, 'Maria': 25, 'Jack': 22}

ages_series = pd.Series(student_ages_dict, name='Ages')
print(ages_series)

Anna     21
Bob      20
Maria    25
Jack     22
Name: Ages, dtype: int64


In [31]:
ages_series.index = ['A', 'B', 'M', 'J']
print(ages_series)

A    21
B    20
M    25
J    22
Name: Ages, dtype: int64


### Modifying a series object

In [32]:
ages_series['Jack'] = 23
print(ages_series)

A       21
B       20
M       25
J       22
Jack    23
Name: Ages, dtype: int64


In [33]:
new_ages_series = ages_series.drop(index='Maria')
print(new_ages_series)

KeyError: "['Maria'] not found in axis"

In [None]:
ages_series['Maria'] = 25
print(ages_series)

In [None]:
new_recs = pd.Series({'Jon': 34, 'Peter': 23, 'Karo': 45, 'Abby': 25})
ages_series = pd.concat([ages_series, new_recs])
print(ages_series)

## Dataframes

### Creating a dataframe

In [1]:
import pandas as pd

In [2]:
students_list = [['Anna', 'Smith', 21],
                 ['Bob', 'Jones', 20],
                 ['Maria', 'Williams', 25],
                 ['Jack', 'Brown', 22]]

students = pd.DataFrame(students_list, columns=['First Name', 'Family Name', 'Age'])
students

Unnamed: 0,First Name,Family Name,Age
0,Anna,Smith,21
1,Bob,Jones,20
2,Maria,Williams,25
3,Jack,Brown,22


In [None]:
students_number = [100, 200, 300, 400]
students = pd.DataFrame(students_list,
                        columns=['First Name', 'Family Name', 'Age'],
                        index=students_number)
students

In [None]:
# This is a nested dictionary representing the students table
students_dict = {'First Name': {100: 'Anna',
                                200: 'Bob',
                                300: 'Maria',
                                400: 'Jack'},

                 'Family Name': {100: 'Smith',
                                 200: 'Jones',
                                 300: 'Williams',
                                 400: 'Brown'},
                 'Age': {100: 21,
                         200: 20,
                         300: 25,
                         400: 22}}

students = pd.DataFrame(students_dict)
students

In [3]:
pets = {
    'species': ['cat', 'dog', 'parrot', 'cockroach'],
    'name': ['Dr. Mittens Lamar', 'Diesel', 'Peach', 'Richard'],
    'legs': [4, 4, 2, 6],
    'wings': [0, 0, 2, 4],
    'looking_for_home': ['no', 'no', 'no', 'yes']
}

pets_df = pd.DataFrame(pets)
pets_df.head()

Unnamed: 0,species,name,legs,wings,looking_for_home
0,cat,Dr. Mittens Lamar,4,0,no
1,dog,Diesel,4,0,no
2,parrot,Peach,2,2,no
3,cockroach,Richard,6,4,yes


### Parts of a Dataframe

#### Indexes
The index of a dataframe can be seen as the first column of the dataframe. These indexes can be in diffrent types like strinds, Datetime objects, floats, or integers. By default this is done by using integers, starting form 0. These indexes reflec the position of an element. To see the indexes used in a dataframe you can use the `df.axes`

In [6]:
pets_df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['species', 'name', 'legs', 'wings', 'looking_for_home'], dtype='object')]

For more detail, the `df.info()` function can also be used.

In [7]:
pets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   species           4 non-null      object
 1   name              4 non-null      object
 2   legs              4 non-null      int64 
 3   wings             4 non-null      int64 
 4   looking_for_home  4 non-null      object
dtypes: int64(2), object(3)
memory usage: 288.0+ bytes


#### Axes
In the dataframes created below, we can see two axes, the vertical rows, axis 0, and the horizatol columns, axis 1. No it doesnt make scecse for me neither but hey so be it. Let's take a look at the axes of our dataframe


The fist object in the dataframe is the indexing method for rows and the second for the coloms. You can also see the data type of the indexes 

### Accessing data in a dataframe

In [None]:
students.shape

In [None]:
students.head(2)

In [None]:
students.tail(2)

In [None]:
students['Age']

### Saving a dataframe to file

In [None]:
students.to_csv('student_names.csv', sep='\t', columns=['First Name', 'Family Name'], index=False)

### Exercises

In [None]:
import pandas as pd

my_dict = {'A': {1: 1,
                 2: 4,
                 3: 6},
           'B': {1: 2,
                 2: 7,
                 3: 10},
           'C': {1: 3,
                 2: 11,
                 3: 16}}

my_df = pd.DataFrame(my_dict)

# print column C of my_df
my_df['C']

## Exporting data

### CSV

In [None]:
import pandas as pd


def split_csv(filename):
    chunksize = 50
    i = 1
    for chunk in pd.read_csv(filename, chunksize=chunksize, sep=";", encoding='utf-8'):
        chunk.to_csv(f'TD RefDB.UA 2.{i} Beschibare og.csv'.format(i), sep=";", index=False)
        i += 1


split_csv('Test data/exports/TD RefDB.UA 2 Beschibare og.csv')