#### Pandas 

    - a powerful data manipulation library in Python, widely used for data analysis and data cleaning. 
    
    - It provides two primary data structures: 
    
    - "Series" - a one-dimensional array-like object, 
    
    - "DataFrame" - a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

#### common functions used:

1- .Series(list)

2- .shape		--> attribute (no parentheses); returns a tuple of dimensions

3- .DataFrame(list or dict)

4- .read_csv()

5- .head()

6- .tail()

7- .loc[row_label, col_label]		--> label-based selection (uses square brackets, not parentheses)

8- .iloc[row_pos, col_pos]		--> integer-position-based selection; returns a single value when both positions are given

9- .at[index_num, col_name]

10- .iat[row_num, col_num]

11- .dtypes

12- .describe()

13- .drop(col_name, axis=1, inplace = True)

In [315]:
import pandas as pd

In [316]:
# Build a Series (a labelled 1-D array) from a plain Python list.
# No index is supplied, so pandas assigns the default integer labels,
# which show up as the left-hand column of the printed output.
series1_num = pd.Series(data=[11, 3, 5, 7])

# Show the Series and then its Python type, one per line.
print(series1_num, type(series1_num), sep="\n")

0    11
1     3
2     5
3     7
dtype: int64
<class 'pandas.core.series.Series'>


In [317]:
# Get the shape of the Series created above.
# `.shape` is an attribute (no parentheses); for a 1-D Series it is a one-element tuple.
series1_num.shape       # -> (4,)

(4,)

In [318]:
# A Series is still one-dimensional when it holds strings instead of numbers.
series1_str = pd.Series(data=['Hi', 'Bye'])

# Print the Series, then its shape tuple -> (2,)
print(series1_str, series1_str.shape, sep="\n")

0     Hi
1    Bye
dtype: object
(2,)


In [319]:
# Build a Series from a dictionary: the dictionary keys become the index
# labels and the dictionary values become the data.

dict_obj: dict[str, int] = dict(a=1, b=2, c=3)

series2_num = pd.Series(data=dict_obj)

# Print the Series, then its shape tuple -> (3,)
print(series2_num, series2_num.shape, sep="\n")

a    1
b    2
c    3
dtype: int64
(3,)


In [320]:
# Supply custom index labels instead of the default integer ones.
# `index` needs exactly one label per value.

list_2: list[int] = [20, 30, 400, 55]
ind: list[str] = [
    'alpha',
    'beta',
    'charlie',
    'delta',
]

series3_num = pd.Series(data=list_2, index=ind)
print(series3_num)

alpha       20
beta        30
charlie    400
delta       55
dtype: int64


In [321]:
# create Dataframe from a dictonary of list values (dict keys will still be string)

person_dict: dict[str, list[str | int]] = {
    'Name': ['John', 'Bravo', 'Kilmer'],
    'Age': [64, 47, 42],
    'City': ['Austin', 'Lisbon', 'London']
}

df_person = pd.DataFrame(person_dict)
print(df_person)

print(df_person.shape)          # 3 x 3 matrix (3, 3)

     Name  Age    City
0    John   64  Austin
1   Bravo   47  Lisbon
2  Kilmer   42  London
(3, 3)


In [322]:
import numpy as np
# Convert the DataFrame to a NumPy array: only the cell values are kept;
# the column names (the dictionary keys 'Name', 'Age' & 'City') are dropped.
np.array(df_person)

array([['John', 64, 'Austin'],
       ['Bravo', 47, 'Lisbon'],
       ['Kilmer', 42, 'London']], dtype=object)

In [323]:
# Build a DataFrame from a list of dictionaries: every dictionary is one
# row, and its keys become the column names.

person_list = [
    {'Name': 'John', 'Age': 64, 'City': 'Austin'},
    {'Name': 'Bravo', 'Age': 47, 'City': 'Lisbon'},
    {'Name': 'Kilmer', 'Age': 42, 'City': 'London'},
]

df_person_1 = pd.DataFrame(data=person_list)
print(df_person_1)


     Name  Age    City
0    John   64  Austin
1   Bravo   47  Lisbon
2  Kilmer   42  London


#### Reading a CSV file (large files, a.k.a. datasets)

In [324]:
# Load the CSV into a DataFrame (the relative path is resolved against the
# current working directory).
df_csv = pd.read_csv(filepath_or_buffer='./intel-stock-data-1980-2024.csv')

# Preview the first 10 records.
print(df_csv.head(n=10))

       Date      Open      High       Low     Close    Volume  Dividends  \
0  01-05-70  0.181500  0.184404  0.181500  0.181500  10924800        0.0   
1  01-05-70  0.181500  0.182952  0.180048  0.180048  17068800        0.0   
2  01-05-70  0.184404  0.187308  0.184404  0.184404  18508800        0.0   
3  01-05-70  0.184403  0.186581  0.183678  0.183678  11174400        0.0   
4  01-05-70  0.180048  0.180048  0.177143  0.177143  12172800        0.0   
5  01-05-70  0.176417  0.176417  0.173514  0.173514   8966400        0.0   
6  01-05-70  0.174240  0.177144  0.174240  0.174240  11347200        0.0   
7  01-05-70  0.174240  0.175692  0.172788  0.172788  16262400        0.0   
8  01-05-70  0.169884  0.169884  0.166980  0.166980  26918400        0.0   
9  01-05-70  0.173514  0.176417  0.173514  0.173514  20102400        0.0   

   Stock Splits  
0           0.0  
1           0.0  
2           0.0  
3           0.0  
4           0.0  
5           0.0  
6           0.0  
7           0.0  
8

In [325]:
# see the last 10 records (tail() would return only 5 if called without an argument)
df_csv.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
11279,01-21-70,20.139999,20.25,19.870001,20.120001,75209300,0.0,0.0
11280,01-21-70,20.02,20.9,19.92,20.780001,77695000,0.0,0.0
11281,01-21-70,20.6,20.799999,20.25,20.34,55104100,0.0,0.0
11282,01-21-70,20.17,21.1,20.0,20.83,73832900,0.0,0.0
11283,01-21-70,20.639999,20.77,20.290001,20.440001,51767200,0.0,0.0
11284,01-21-70,20.48,20.58,19.209999,19.299999,86041200,0.0,0.0
11285,01-21-70,19.530001,19.690001,19.030001,19.059999,69117800,0.0,0.0
11286,01-21-70,18.969999,19.76,18.9,19.52,108453700,0.0,0.0
11287,01-21-70,19.58,20.23,19.52,20.200001,57988400,0.0,0.0
11288,01-21-70,20.219999,20.4,19.959999,20.4,29884000,0.0,0.0


In [326]:
# Display the DataFrame that was built from the list of dictionaries.
df_person_1

Unnamed: 0,Name,Age,City
0,John,64,Austin
1,Bravo,47,Lisbon
2,Kilmer,42,London


In [327]:
# Select a single column by name: df[col_name] yields a Series, not a DataFrame.
# Print the column followed by its type to confirm.
print(df_person_1['Name'], type(df_person_1['Name']), sep="\n")

0      John
1     Bravo
2    Kilmer
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


In [328]:
df_person_1.loc[0]                  # label-based lookup: returns the row whose index label is 0, as a Series
# print(df_person_1.loc[0][0])        # AVOID: the second [0] is a positional lookup on a label-indexed Series, which pandas has deprecated

Name      John
Age         64
City    Austin
Name: 0, dtype: object

In [329]:
# iloc selects purely by integer position: iloc[row_position, column_position].
# Both positions go in ONE indexer. The chained form iloc[0][2] first extracts
# the row as a Series and then does a positional [2] lookup on that
# label-indexed Series, which is deprecated and emits a FutureWarning.
df_person_1.iloc[0, 2]

  df_person_1.iloc[0][2]


'Austin'

In [330]:
# Access a single element by label: .at[row_label, column_name]

# First the 'Name' value in the row labelled 2, then the 'City' value in the
# row labelled 1 - each printed on its own line.
print(df_person_1.at[2, 'Name'], df_person_1.at[1, 'City'], sep="\n")

Kilmer
Lisbon


In [331]:
df_person_1.iat[2,2]        # access a single value directly by integer row & column position

'London'

#### Data manipulation using Dataframes

In [332]:
# Adding a new column: assigning a list to a column name that does not exist yet
# creates that column. The list supplies one value per row (3 rows here).
df_person_1['Salary'] = [50000, 70000, 90000]
df_person_1

Unnamed: 0,Name,Age,City,Salary
0,John,64,Austin,50000
1,Bravo,47,Lisbon,70000
2,Kilmer,42,London,90000


In [333]:
# Display the data type of each column
print('Data types: ', df_person_1.dtypes,'\n')

# Describe the DataFrame: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns only
print('Statistical Summary: ', df_person_1.describe())

Data types:  Name      object
Age        int64
City      object
Salary     int64
dtype: object 

Statistical Summary:               Age   Salary
count   3.000000      3.0
mean   51.000000  70000.0
std    11.532563  20000.0
min    42.000000  50000.0
25%    44.500000  60000.0
50%    47.000000  70000.0
75%    55.500000  80000.0
max    64.000000  90000.0


In [334]:
# Remove a column ex: 'Salary'

# These variants do NOT remove the column from df_person_1:
# df_person_1.drop('Salary')                  # KeyError: axis defaults to 0, so 'Salary' is looked up among the row labels
# df_person_1.drop('Salary', axis = 0)        # same as above - axis = 0 is the default
# df_person_1.drop('Salary', axis = 1)        # returns a NEW DataFrame without 'Salary'; df_person_1 itself is unchanged

# works - permanent operation: inplace = True modifies df_person_1 itself,
# so re-running this cell raises KeyError because 'Salary' is already gone
df_person_1.drop('Salary', axis = 1, inplace = True)            


In [335]:
# Remove a row by its index label - here label 2, the row with Name = 'Kilmer'.
# inplace=True modifies df_person_1 itself rather than returning a new DataFrame.
df_person_1.drop(index=2, inplace=True)
df_person_1

Unnamed: 0,Name,Age,City
0,John,64,Austin
1,Bravo,47,Lisbon


In [336]:
# Add 1 to every value in the 'Age' column (applied to the whole column at
# once, no explicit loop) and store the result back in the same column.
df_person_1['Age'] = df_person_1['Age'] + 1
df_person_1


Unnamed: 0,Name,Age,City
0,John,65,Austin
1,Bravo,48,Lisbon
