# __PANDAS__

In [3]:
%pip install pandera

Collecting pandera
  Downloading pandera-0.18.0-py3-none-any.whl.metadata (15 kB)
Collecting multimethod (from pandera)
  Downloading multimethod-1.11.1-py3-none-any.whl.metadata (8.8 kB)
Collecting pydantic (from pandera)
  Downloading pydantic-2.6.1-py3-none-any.whl.metadata (83 kB)
     ---------------------------------------- 0.0/83.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/83.5 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/83.5 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/83.5 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/83.5 kB ? eta -:--:--
     ------------- ------------------------ 30.7/83.5 kB 186.2 kB/s eta 0:00:01
     ------------- ------------------------ 30.7/83.5 kB 186.2 kB/s eta 0:00:01
     ------------- ------------------------ 30.

In [2]:
import pandas as pd
import pandera as pa

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Series  

In [3]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
s1 : pd.Series = pd.Series((1,2,3,4,5))
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
l1 : list[int] = [1,2,3,4,5,6,7,8,9]
l1

[1, 2, 3, 4, 5, 6, 7, 8, 9]

#### Series has automatic index while list hasn't 
* Series can be made from list
* Series can be made from tuple
* Series cannot be made from set
* Series can be made from dictionary with keys as indexes

In [6]:
s1 : pd.Series = pd.Series({1,2,2,3,3,3,8})
s1

TypeError: 'set' type is unordered

In [9]:
s1 : pd.Series = pd.Series({'a':10,'b':20,'c':30,'d':40})
s1

a    10
b    20
c    30
d    40
dtype: int64

In [7]:
indexes : list[int] = [1,2,3,4,5]
values : list[str] = ["a","b","c","d","e"]
s1 : pd.Series = pd.Series(values,index = indexes)
s1

1    a
2    b
3    c
4    d
5    e
dtype: object

## MULTIPLE INDEXING

In [8]:
values : list[int] = [1,2,3,4,5]
indexes : list[list[str]] = [["a1","a1","a1","b1","b1"],
                            ["a","b","c","d","e"]]
s1 : pd.Series = pd.Series(values,index = indexes)
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
dtype: int64

In [27]:
values : list[int] = [1,2,3,4,5]
indexes : list[list[str]] = [["a1","a1","a1","b1","b1"],
                            ["a","b","c","d","e"]]
s1 : pd.Series = pd.Series(values,index = indexes,name = "Student Data")
s1

a1  a    1
    b    2
    c    3
b1  d    4
    e    5
Name: Student Data, dtype: int64

In [9]:
values : list[int] = [1,2,3,4,5,6,7,8,9,10]
indexes : list[list[str]] = [["A1","A1","A1","A1","A1","T1","T1","T1","T1","T1"],
                             ["a1","a1","a1","c1","c1","b1","b1","b1","d1","d1"],
                            ["a","b","c","d","e","f","g","h","i","j"]]
s1 : pd.Series = pd.Series(values,index = indexes,name = "Student Data")
s1

A1  a1  a     1
        b     2
        c     3
    c1  d     4
        e     5
T1  b1  f     6
        g     7
        h     8
    d1  i     9
        j    10
Name: Student Data, dtype: int64

# DATAFRAME

In [10]:
s1 : pd.Series = pd.Series([1,2,3,4,5],name = "Student ID")
s2 : pd.Series = pd.Series([10,20,30,40,50],name = "Student Marks")
s3 : pd.Series = pd.Series(["A","B","C","D","E"],name = "Student Name")

In [11]:
df : pd.DataFrame = pd.DataFrame({"Student ID":s1,"Student Marks":s2,"Student Name":s3})
df

Unnamed: 0,Student ID,Student Marks,Student Name
0,1,10,A
1,2,20,B
2,3,30,C
3,4,40,D
4,5,50,E


In [12]:
df : pd.DataFrame = pd.concat([s1,s2,s3],axis = 1)
df

Unnamed: 0,Student ID,Student Marks,Student Name
0,1,10,A
1,2,20,B
2,3,30,C
3,4,40,D
4,5,50,E


In [13]:
data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]

df : pd.DataFrame = pd.DataFrame(data,columns = ["A","B","C"])
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [16]:
data : list[list[int]] = [[1,2,3],
                          [4,5,6],
                          [7,8,9]]

df : pd.DataFrame = pd.DataFrame(data,columns = ["A","B","C"],index= ['x','y','z'])
df

Unnamed: 0,A,B,C
x,1,2,3
y,4,5,6
z,7,8,9


In [14]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [15]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [16]:
df.values

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]], dtype=int64)

## Exercise
##### Create an array upto 99 numbers and convert it to dataframe

In [17]:
import numpy as np
from typing import Any
from nptyping import NDArray,Shape,UInt64

data : NDArray[Shape["10,10"],Any]= np.arange(10*10).reshape(10,10)
data

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [18]:
data = pd.DataFrame(data,columns = list('ABCDEFGHIJ'))
data

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [None]:
data : list[pd.DataFrame] = pd.read_html('https://www.w3schools.com/python/python_operators.asp#gsc.tab=0')
data

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4


In [None]:
data[0]

Unnamed: 0,Operator,Name,Example,Try it
0,+,Addition,x + y,Try it »
1,-,Subtraction,x - y,Try it »
2,*,Multiplication,x * y,Try it »
3,/,Division,x / y,Try it »
4,%,Modulus,x % y,Try it »
5,**,Exponentiation,x ** y,Try it »
6,//,Floor division,x // y,Try it »


In [None]:
data2 = pd.read_json('https://www.w3schools.com/python/pandas/data.js')
data2

NameError: name 'pd' is not defined

In [1]:
print("jhf")

jhf


In [7]:
import pandas as pd
import pandera as pa

df = pd.DataFrame(
    {
        "Column1":[1,4,0,10,9],
        "Column2":[-1.3,-1.4,-2.9,-10.1,-20.4],
        "Column3":["value_1","value_2","value_3","value_4","value_5"]
    }
)

schema = pa.DataFrameSchema({
    'Column1':pa.Column(checks = pa.Check.le(10)),
    'Column2':pa.Column(checks = pa.Check.lt(-1.2)),
    'Column3':pa.Column(checks = [pa.Check.str_startswith("value_")
                                  ,pa.Check(lambda s : s.str.split('_',expand = True).shape[1] == 2)])
})

validated_df = schema(df)
print(validated_df)

   Column1  Column2  Column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_4
4        9    -20.4  value_5


In [8]:
dir(pa.Check)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_get_check_fn_code',
 'between',
 'eq',
 'equal_to',
 'equal_to',
 'from_builtin_check_name',
 'ge',
 'get_backend',
 'get_builtin_check_fn',
 'greater_than',
 'greater_than',
 'greater_than_or_equal_to',
 'greater_than_or_equal_to',
 'gt',
 'in_range',
 'in_range',
 'isin',
 'isin',
 'le',
 'less_than',
 'less_than',
 'less_than_or_equal_to',
 'less_than_or_equal_to',
 'lt',
 'ne',
 'not_equal_to',
 'not_equal_to',
 'notin',
 'notin',
 'one_sample_ttest',
 'register_backend',
 'register_builtin_check_fn',
 'str_contains',
 'str_contains',
 'str_endswith',
 'str_endswith',


# __SLICING & INDEXING__
* Series[index]
* Dataframe
    * loc
    * iloc
    * at
    * iat

In [9]:
s1 : pd.Series = pd.Series([1,2,3,4,5])
s1

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [10]:
s1[4]

5

In [11]:
s1[:4]

0    1
1    2
2    3
3    4
dtype: int64

In [13]:
s1[0:4]

0    1
1    2
2    3
3    4
dtype: int64

<table>
    <thead>
        <th>LOC</th>
        <th>ILOC</th>
    </thead>
    <tbody>
    <tr>
        <td>Can be accessed by the name of the column or label of the row</td>
        <td>Only accessed by the indexed of the row</td>
    </tr>
    <tr>
        <td>Includes the last index too</td>
        <td>Exclude the last index</td>
    </tr>
    <tr>
        <td>df.loc['a':'e'] = a,b,c,d,e</td>
        <td>df.iloc[0:5] = 0,1,2,3,4</td>
    </tr>
    </tbody>
</table>

In [14]:
s1.iloc[1:4]

1    2
2    3
3    4
dtype: int64

In [16]:
s1.iloc[0:4]

0    1
1    2
2    3
3    4
dtype: int64

In [17]:
s1.loc[2]

3

In [20]:
values : list[int] = [1,2,3,4,5]
indexes : list[str] = ["a","b","c","d","e"]
s1 : pd.Series = pd.Series(values,index = indexes)
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [25]:
s1.loc['a':'e']

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [26]:
s1.iloc[0:4]

a    1
b    2
c    3
d    4
dtype: int64

<table>
    <thead>
        <th>AT</th>
        <th>IAT</th>
    </thead>
    <tbody>
    <tr>
        <td>Returns a particular label value</td>
        <td>Returns a particular Index value</td>
    </tr>
    <tr>
        <td>df/series . at [label] = value</td>
        <td>df/series . iat [index] = value</td>
    </tr>
    </tbody>
</table>

In [27]:
s1.iat[1] 

2

In [28]:
s1.at['a']

1

# __REGEX__

In [29]:
import re
x = """
Engr Kanwal Shehzadi (Project Manager)
"""
pattern = "Kanwal"
re.findall(pattern,x)

['Kanwal']

In [40]:
import re 
x = """
22:22:22 PM KanwalShehzadi 
PIAIC-12345
22:22:22 PM Kanwal Shehzadi 
PIAIC-1234
22:22:22 PM Kanwal Shehzadi 
PIAIC-12345
22:22:22 PM Kanwal Shehzadi 
PIAIC-12345
22:22:22 PM Kanwal Shehzadi 
PIAIC-12345
"""
pattern = fr'(\d{2}:\d{2}:\d{2}) PM (.*)\n(PIAIC-? ?\d{5,6})'
reg = re.findall(pattern,x)
reg

[]