# Series and DataFrame

# 1. Pandas

In [1]:
import pandas as pd
import numpy as np

# Creating a Series from a Python list with default numeric index

In [None]:

# Sample inputs for the Series examples: one integer value per country label.
# The two lists are parallel (same length, same order).
mydata = [1776, 1867, 1821]
myindex = ['USA', 'Canada', 'England']



Series with default index: 
 0    1776
1    1867
2    1821
dtype: int64


## Series with default integer Index

In [4]:

# No index is passed, so pandas labels the entries 0, 1, 2, ... automatically.
myser = pd.Series(mydata)
print("Series with default index: \n", myser)

Series with default index: 
 0    1776
1    1867
2    1821
dtype: int64


## Creating a series with custom index

In [3]:
# Pair each value with its label from `myindex` instead of the default 0..n-1.
myser = pd.Series(mydata, index=myindex)
print("Series with named index: \n", myser)

Series with named index: 
 USA        1776
Canada     1867
England    1821
dtype: int64


## Creating a Series from a NumPy array

In [5]:
# Seed NumPy's global generator before drawing so that the four "random"
# integers (and every output derived from them) are identical on each
# Restart & Run All. 101 matches the seed used for the DataFrame example.
np.random.seed(101)

# Four integers drawn from [0, 100).
ran_data = np.random.randint(0,100,4)
print("\nRandom data from Numpy array: ", ran_data)


Random data from Numpy array:  [85 38 13 40]


In [6]:
# Label each element of the NumPy array with a person's name.
# `names` must have exactly as many entries as `ran_data`.
names = ['Alice', 'Bob', 'Charles', 'Dave']
ages = pd.Series(data=ran_data, index=names)
print("\nSeries from Numpy array: \n", ages)


Series from Numpy array: 
 Alice      85
Bob        38
Charles    13
Dave       40
dtype: int32


## Creating a Series from a dictionary

In [7]:
# When a Series is built from a dict, the keys become the index labels and
# the values become the data.
ages_dict = dict(Sammy=5, Frank=10, Spike=7)
print("\nSeries from Disctionary :\n", pd.Series(data=ages_dict))


Series from Disctionary :
 Sammy     5
Frank    10
Spike     7
dtype: int64


# Creating a Series from imaginary sales data

In [13]:
# Imaginary quarterly sales figures keyed by country.
# Note that 'japan' is lowercase here, so label lookups must use that exact spelling.
q1 = {'japan': 80, 'China': 450, 'India':200, 'USA':250}
q2 = {'Brazil': 100, 'China': 500, 'India':210, 'USA':260}

# Convert dictionaries into pandas Series (dict keys become the index labels)
sales_Q1 = pd.Series(q1)
sales_Q2 = pd.Series(q2)
print("\nSales data for Q1", sales_Q1)

# Label-based access: look a value up by its named index
print("\nSales in Japan for Q1",sales_Q1['japan'])

# Position-based access: use .iloc explicitly.
# Plain `sales_Q1[0]` on a Series with string labels relies on a deprecated
# fallback that treats the integer as a position and emits a FutureWarning.
print("\nFirst entry in Q1 sales data: ", sales_Q1.iloc[0])


Sales data for Q1 japan     80
China    450
India    200
USA      250
dtype: int64

Sales in Japan for Q1 80

First entry in Q1 sales data:  80


  print("\nFirst entry in Q1 sales data: ", sales_Q1[0])


In [None]:
# Errors: looking up a label that is not in the index raises KeyError.
# Labels must match exactly, including whitespace and letter case.
# Each lookup is wrapped in try/except so that all three cases are shown;
# left uncaught, the first KeyError would abort the cell before the other
# two lines ran and would break Restart & Run All.
bad_lookups = [
    ('France', 'non-existent key'),
    ('USA ', 'extra space in key'),
    ('usa', 'case mismatch'),
]
for bad_key, reason in bad_lookups:
    try:
        print(sales_Q1[bad_key])
    except KeyError as err:
        print(f"KeyError for {bad_key!r} ({reason}): {err}")


In [14]:
# Series operations: .keys() returns the index labels of the Series.
print(
    "\nSeries keys : ",
    sales_Q1.keys(),
)


Series keys :  Index(['japan', 'China', 'India', 'USA'], dtype='object')


In [15]:
# Broadcasting: arithmetic between a Series and a scalar is applied to every
# element. Multiplying integers keeps int64; true division yields float64.
print("\nSales Q1 doubled:\n", 2 * sales_Q1)
print("\nSales Q2 divided by 100:\n", sales_Q2 / 100)


Sales Q1 doubled:
 japan    160
China    900
India    400
USA      500
dtype: int64

Sales Q2 divided by 100:
 Brazil    1.0
China     5.0
India     2.1
USA       2.6
dtype: float64


In [17]:
# Handling mismatched indices: `+` aligns the two Series on their labels, so a
# country present in only one quarter ('japan', 'Brazil') comes out as NaN.
print("\nSales Q1 + Sales Q2 (with NaN where no matching key): \n", sales_Q1+sales_Q2)

# Handling missing data: .add(..., fill_value=0) substitutes 0 for a label that
# is missing on one side, so every country gets a numeric total.
# The label previously began with "\S", an invalid escape sequence that printed
# a literal backslash; the intended leading newline is "\n".
print("\nSales Q1 + Sales Q2 (fill NaN with 0):\n",sales_Q1.add(sales_Q2, fill_value=0))


Sales Q1 + Sales Q2 (with NaN where no matching key): 
 Brazil      NaN
China     950.0
India     410.0
USA       510.0
japan       NaN
dtype: float64
\Sales Q1 + Salees Q2 (fill NaN with 0):
 Brazil    100.0
China     950.0
India     410.0
USA       510.0
japan      80.0
dtype: float64


# 2. DataFrame

In [30]:
# Row and column labels used by the labelled DataFrame examples.
mycolumns = ['Jan', 'Feb', 'Mar']
myindex = ['CA', 'NY', 'AZ', 'TX']

# Fix the seed so the 4x3 block of integers drawn from [0, 101) is the same
# on every run.
np.random.seed(101)
mydata = np.random.randint(low=0, high=101, size=(4, 3))
print("Generated Data (mydata): \n", mydata)

# Creating a DataFrame without index and columns: pandas numbers both the
# rows and the columns from 0.
df = pd.DataFrame(mydata)
print("\nDataframe without index/cols: \n", df)




Generated Data (mydata): 
 [[95 11 81]
 [70 63 87]
 [75  9 77]
 [40  4 63]]

Dataframe without index/cols: 
     0   1   2
0  95  11  81
1  70  63  87
2  75   9  77
3  40   4  63


In [None]:

# Row labels only: the columns keep their default 0..2 numbering.
df = pd.DataFrame(mydata, index=myindex)
print("\nDataFrame with custom row index : \n", df)

# Column names only: the rows keep their default 0..3 numbering.
df = pd.DataFrame(mydata, columns=mycolumns)
print("\nDataFrame with custom and column names: \n", df)

# Both row labels and column names. This is the frame left in `df` for the
# cells that follow.
df = pd.DataFrame(mydata, myindex, mycolumns)
print("\nDataFrame with custom row index and column names: \n", df)



DataFrame with custom row index : 
      0   1   2
CA  95  11  81
NY  70  63  87
AZ  75   9  77
TX  40   4  63

DataFrame with custom and column names: 
    Jan  Feb  Mar
0   95   11   81
1   70   63   87
2   75    9   77
3   40    4   63

DataFrame with custom row index and column names: 
     Jan  Feb  Mar
CA   95   11   81
NY   70   63   87
AZ   75    9   77
TX   40    4   63


In [36]:
# Displaying information about the dataframe.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print(): doing so appends a stray "None" line after
# the summary.
print("\nDataFrame Info:")
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int32
 1   Feb     4 non-null      int32
 2   Mar     4 non-null      int32
dtypes: int32(3)
memory usage: 80.0+ bytes
None


# Creating a DataFrame from a CSV file

In [47]:
# Load the tips dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file carries 'Payer Name' and 'CC Number' columns. If the
# data is not synthetic, clear rendered outputs before sharing the notebook.
df = pd.read_csv(filepath_or_buffer='./data_pd/tips.csv')

# Other quick inspections worth trying on the loaded frame:
#   df.columns, df.index, df.tail(3), len(df),
#   df.describe(), df.describe().transpose()
df.head(n=3)



Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560330000000000.0,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478070000000000.0,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011810000000000.0,Sun4458


In [50]:
# Column selection and indexing.
# Selecting one column with a single label returns a pandas Series; the type
# is printed after a 100-character separator line to make that explicit.
print(
    df['total_bill'],
    "=" * 100,
    type(df['total_bill']),
    sep="\n",
)

0      16.99
1      10.34
2      21.01
3      23.68
4      24.59
       ...  
239    29.03
240    27.18
241    22.67
242    17.82
243    18.78
Name: total_bill, Length: 244, dtype: float64
<class 'pandas.core.series.Series'>


In [52]:
# Selecting multiple columns: pass a *list* of column names inside the
# brackets (hence the double brackets). Leaving the expression as the cell's
# last line gives the rich table display instead of print()'s plain text.
df[
    ['total_bill', 'tip']
]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [53]:
# Create new columns based on existing ones (element-wise, row by row).

# Tip as a percentage of the bill; added as a new column.
df['tip_percentage'] = (100 * df['tip']) / df['total_bill']

# Bill split evenly across the party; this assigns to 'price_per_person',
# a column the loaded CSV already provides, so those values are replaced.
df['price_per_person'] = df['total_bill'] / df['size']

df.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495,Christy Cunningham,3560330000000000.0,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,Douglas Tucker,4478070000000000.0,Sun4608,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,Travis Walters,6011810000000000.0,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676140000000000.0,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,Tonya Carter,4832730000000000.0,Sun2251,14.680765


In [54]:
# Adjust the 'price_per_person' column by rounding it to 2 decimal places.
# The column is overwritten in place with the rounded values.
df['price_per_person'] = np.round(df['price_per_person'], decimals=2)
df.head()

Unnamed: 0,total_bill,tip,gender,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,tip_percentage
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560330000000000.0,Sun2959,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478070000000000.0,Sun4608,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011810000000000.0,Sun4458,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676140000000000.0,Sun5260,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832730000000000.0,Sun2251,14.680765
