# Pandas

Open source library that provides high performance, easy-to-use data analysis tools for Python.
- Used for data manipulation and data visualization
- Panda data structures -> data frames and series

In [3]:
# Importing pandas library
import pandas as pd

## Series

In [9]:
# Creating a series from a tuple
series = pd.Series(('U', 'G', 'A', 'N', 'D', 'A'))
print("Series: ", series,end= "\n", sep = "\n")
print("Type: ", type(series))

Series: 
0    U
1    G
2    A
3    N
4    D
5    A
dtype: object
Type:  <class 'pandas.core.series.Series'>


In [12]:
# Creating a series from a list
series = pd.Series([1, 2, 3, 4, 5, 6])
print("Series: ", series, end = "\n", sep = "\n")
print("Type: ", type(series))

Series: 
0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64
Type:  <class 'pandas.core.series.Series'>


In [15]:
# Creating a date series
# Format : mm-dd-yyyy

date_series = pd.date_range(start = "02-01-2023", end = "02-24-2023")
print("Date series: ", date_series, sep = "\t")
print("Type: ", type(date_series), sep = '\t')

Date series: 	DatetimeIndex(['2023-02-01', '2023-02-02', '2023-02-03', '2023-02-04',
               '2023-02-05', '2023-02-06', '2023-02-07', '2023-02-08',
               '2023-02-09', '2023-02-10', '2023-02-11', '2023-02-12',
               '2023-02-13', '2023-02-14', '2023-02-15', '2023-02-16',
               '2023-02-17', '2023-02-18', '2023-02-19', '2023-02-20',
               '2023-02-21', '2023-02-22', '2023-02-23', '2023-02-24'],
              dtype='datetime64[ns]', freq='D')
Type: 	<class 'pandas.core.indexes.datetimes.DatetimeIndex'>


In [25]:
#Functions to use with date_range 

print("All the years: " , date_series.year, sep = "\t")
print("All the months: ", date_series.month)
print("All the names of the months: ", date_series.month_name())
print("All the days: ", date_series.day)
print("All the days of the week: ", date_series.day_name())
print("Which day of the year are they: ", date_series.day_of_year) # The days of the year range from 1 to 365

All the years: 	Int64Index([2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,
            2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023, 2023,
            2023, 2023],
           dtype='int64')
All the months:  Int64Index([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2],
           dtype='int64')
All the names of the months:  Index(['February', 'February', 'February', 'February', 'February', 'February',
       'February', 'February', 'February', 'February', 'February', 'February',
       'February', 'February', 'February', 'February', 'February', 'February',
       'February', 'February', 'February', 'February', 'February', 'February'],
      dtype='object')
All the days:  Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24],
           dtype='int64')
All the days of the week:  Index(['Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday',
     

In [44]:
# Indexing

integer_series = pd.Series([1, 2, 3, 4, 5, 6, 7])
print("Integers: ", integer_series, end="\n\n", sep ="\n")
print("The 0th element in the series: ", integer_series[0], end ="\n\n")
print("The first 4 elements in the series: ", integer_series[:4], end="\n\n", sep="\n")
print("The 1st and the 4th element in the series: ", integer_series[[0,3]], sep="\n")


Integers: 
0    1
1    2
2    3
3    4
4    5
5    6
6    7
dtype: int64

The 0th element in the series:  1

The first 4 elements in the series: 
0    1
1    2
2    3
3    4
dtype: int64

The 1st and the 4th element in the series: 
0    1
3    4
dtype: int64


In [50]:
# Renaming the indexes
integer_series = pd.Series([1, 2, 3, 4, 5], index = [101, 102, 103, 104, 105])
print("Series: ", integer_series, sep = "\n", end = "\n\n")

# Easier and more efficient way
integer_series = pd.Series([1, 2, 3, 4, 5], range(101, 106))
print("Series: ", integer_series, sep ="\n")

Series: 
101    1
102    2
103    3
104    4
105    5
dtype: int64

Series: 
101    1
102    2
103    3
104    4
105    5
dtype: int64


## Data Frames

In [59]:
student_df = pd.DataFrame({'Name': ['Joy', 'Grace', 'Purity', 'Dan', 'Alex'],
                         'Roll number': [1, 4, 9, 10, 14],
                         'Marks': [90, 40, 87, 65, 78]})
student_df

Unnamed: 0,Name,Roll number,Marks
0,Joy,1,90
1,Grace,4,40
2,Purity,9,87
3,Dan,10,65
4,Alex,14,78


In [61]:
# Re-indexing according to roll number
roll_number =[1, 4, 9, 14, 10]
student_df = pd.DataFrame({'Name': ['Joy', 'Grace','Purity', 'Dan', 'Alex'],
                          'Marks':[90, 40, 87, 65, 78]
        }, index = roll_number)
student_df

Unnamed: 0,Name,Marks
1,Joy,90
4,Grace,40
9,Purity,87
14,Dan,65
10,Alex,78


In [64]:
# Reading a csv file
# 3 ways
#dummy_df = pd.read_csv(r'C:\folder\file.csv')
#dummy_df = pd.read_csv('C:\\folder\file.csv')
#dummy_df = pd.read_csv('C:/folder/file.csv')

#OR:
dummy_df = pd.read_csv('dummy_data.csv')

# Setting the index to a certain value
dummy_df = dummy_df.set_index('id')
dummy_df

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male
2,Pooh,Casado,Genderqueer
3,Krispin,Govinlock,Agender
4,Tiphany,Dabney,Bigender
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender
7,Michal,Gath,Female
8,Stella,Shadwick,Genderfluid
9,Consuelo,Asty,Polygender
10,Amabel,Mortimer,Male


# Functions

In [65]:
# Viewing the first 6 records

dummy_df.head(6)

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male
2,Pooh,Casado,Genderqueer
3,Krispin,Govinlock,Agender
4,Tiphany,Dabney,Bigender
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender


In [66]:
# Viewing the last 6 records

dummy_df.tail(6)

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender
7,Michal,Gath,Female
8,Stella,Shadwick,Genderfluid
9,Consuelo,Asty,Polygender
10,Amabel,Mortimer,Male


In [67]:
# Information about the dataframe

dummy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 1 to 10
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   first_name  10 non-null     object
 1   last_name   10 non-null     object
 2   gender      10 non-null     object
dtypes: object(3)
memory usage: 320.0+ bytes


In [69]:
# Shape of the data - 3 columns, 10 entries
dummy_df.shape

(10, 3)

In [71]:
# Describes the distribution of the data
# discrete and continuous
# Used for Integer and float values mostly
# Not used in categorical variables
dummy_df.describe()

Unnamed: 0,first_name,last_name,gender
count,10,10,10
unique,10,10,7
top,Dodi,MacCurley,Male
freq,1,1,2


In [72]:
# Gives you an index of all the columns
dummy_df.columns

Index(['first_name', 'last_name', 'gender'], dtype='object')

In [73]:
# Displaying the columns as a list
list(dummy_df.columns)

['first_name', 'last_name', 'gender']

In [75]:
# Displaying an array of all the rows
print(dummy_df.values, end = "\n\n")
print("The Oth element is ", dummy_df.values[0])

[['Dodi' 'MacCurley' 'Male']
 ['Pooh' 'Casado' 'Genderqueer']
 ['Krispin' 'Govinlock' 'Agender']
 ['Tiphany' 'Dabney' 'Bigender']
 ['Derry' 'Fehner' 'Genderfluid']
 ['Heriberto' 'Behninck' 'Bigender']
 ['Michal' 'Gath' 'Female']
 ['Stella' 'Shadwick' 'Genderfluid']
 ['Consuelo' 'Asty' 'Polygender']
 ['Amabel' 'Mortimer' 'Male']]

The Oth element is  ['Dodi' 'MacCurley' 'Male']


In [76]:
# Return the number of all the non-values in each row
dummy_df.count()

first_name    10
last_name     10
gender        10
dtype: int64

In [78]:
# To describe categorical data - object type variable
# Gives a count of all the unique values present in the columns
dummy_df['gender'].value_counts()

Male           2
Bigender       2
Genderfluid    2
Genderqueer    1
Agender        1
Female         1
Polygender     1
Name: gender, dtype: int64

In [82]:
# Selecting rows
# You cannot associate a single value and get the same output
dummy_df[0:1]

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male


In [83]:
dummy_df[1:8:2] # values on a range of 1-8 with an increment of 2

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Pooh,Casado,Genderqueer
4,Tiphany,Dabney,Bigender
6,Heriberto,Behninck,Bigender
8,Stella,Shadwick,Genderfluid


In [89]:
# Accessing columns
# First method - Preferred
print("Gender: ", dummy_df['gender'],sep="\n")

# Second method
print("Gender: " , dummy_df.gender)

type(dummy_df['gender'])

Gender: 
id
1            Male
2     Genderqueer
3         Agender
4        Bigender
5     Genderfluid
6        Bigender
7          Female
8     Genderfluid
9      Polygender
10           Male
Name: gender, dtype: object
Gender:  id
1            Male
2     Genderqueer
3         Agender
4        Bigender
5     Genderfluid
6        Bigender
7          Female
8     Genderfluid
9      Polygender
10           Male
Name: gender, dtype: object


pandas.core.series.Series

In [92]:
# Displaying multiple columns
# Output is a dataframe

print(dummy_df[['first_name','gender']],sep="\n")
type(dummy_df)

   first_name       gender
id                        
1        Dodi         Male
2        Pooh  Genderqueer
3     Krispin      Agender
4     Tiphany     Bigender
5       Derry  Genderfluid
6   Heriberto     Bigender
7      Michal       Female
8      Stella  Genderfluid
9    Consuelo   Polygender
10     Amabel         Male


pandas.core.frame.DataFrame

## Indexing
- Label-based indexing -> df.loc
- Position-based indexing -> df.iloc

In [93]:
dummy_df

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male
2,Pooh,Casado,Genderqueer
3,Krispin,Govinlock,Agender
4,Tiphany,Dabney,Bigender
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender
7,Michal,Gath,Female
8,Stella,Shadwick,Genderfluid
9,Consuelo,Asty,Polygender
10,Amabel,Mortimer,Male


In [96]:
# 10th value from the gender column
dummy_df.loc[10, 'gender']

'Male'

In [99]:
# Printing the entire 10th row
print(dummy_df.loc[10])
type(dummy_df.loc[10])

first_name      Amabel
last_name     Mortimer
gender            Male
Name: 10, dtype: object


pandas.core.series.Series

In [100]:
# iloc -> use indexes
# The value of 0th row, 1st column

print(dummy_df.iloc[0,1])

MacCurley


In [104]:
# Printing the 10th row values -> index = 9

print(dummy_df.iloc[9])
print("Type: ", type(dummy_df.iloc[9]))

first_name      Amabel
last_name     Mortimer
gender            Male
Name: 10, dtype: object
Type:  <class 'pandas.core.series.Series'>


In [105]:
dummy_df

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male
2,Pooh,Casado,Genderqueer
3,Krispin,Govinlock,Agender
4,Tiphany,Dabney,Bigender
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender
7,Michal,Gath,Female
8,Stella,Shadwick,Genderfluid
9,Consuelo,Asty,Polygender
10,Amabel,Mortimer,Male


In [112]:
# Conditional statements
# How many rows where the gender is male

dummy_df[dummy_df.gender == 'Male']

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Dodi,MacCurley,Male
10,Amabel,Mortimer,Male


In [114]:
# Values of rows where index > 3 ->id

dummy_df[dummy_df.index > 3]

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Tiphany,Dabney,Bigender
5,Derry,Fehner,Genderfluid
6,Heriberto,Behninck,Bigender
7,Michal,Gath,Female
8,Stella,Shadwick,Genderfluid
9,Consuelo,Asty,Polygender
10,Amabel,Mortimer,Male


In [117]:
# Rows where the index is divisible by 5

dummy_df[dummy_df.index % 5 == 0]

Unnamed: 0_level_0,first_name,last_name,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,Derry,Fehner,Genderfluid
10,Amabel,Mortimer,Male


In [127]:
# Count of all unique values present in the first_name column
# To do