In [2]:
import pandas as pd
import numpy as np

In [3]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
dates = pd.date_range("20240919", periods=10)

In [6]:
dates

DatetimeIndex(['2024-09-19', '2024-09-20', '2024-09-21', '2024-09-22',
               '2024-09-23', '2024-09-24', '2024-09-25', '2024-09-26',
               '2024-09-27', '2024-09-28'],
              dtype='datetime64[ns]', freq='D')

In [7]:
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list("ABCD"))

In [8]:
df

Unnamed: 0,A,B,C,D
2024-09-19,2.498754,0.681015,-0.629096,-1.65442
2024-09-20,1.622092,-1.078239,-0.365883,1.223361
2024-09-21,-0.219813,-0.768508,0.238645,1.347021
2024-09-22,-0.614123,1.48614,-0.221546,-0.085369
2024-09-23,-0.022053,0.173989,0.187356,0.727575
2024-09-24,2.159009,-0.220985,-1.49169,-1.464994
2024-09-25,1.821493,1.037353,0.706072,-0.210938
2024-09-26,0.365224,1.036458,0.226146,0.157349
2024-09-27,0.593835,-0.54305,1.264084,-0.617011
2024-09-28,-0.682238,-0.024755,-0.349666,0.807032


In [9]:
df2 = pd.DataFrame(
    {
        "A": 1.5,
        "B": pd.Timestamp("20240919"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([4] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [10]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.5,2024-09-19,1.0,4,test,foo
1,1.5,2024-09-19,1.0,4,train,foo
2,1.5,2024-09-19,1.0,4,test,foo
3,1.5,2024-09-19,1.0,4,train,foo


In [11]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [12]:
df.head()

Unnamed: 0,A,B,C,D
2024-09-19,2.498754,0.681015,-0.629096,-1.65442
2024-09-20,1.622092,-1.078239,-0.365883,1.223361
2024-09-21,-0.219813,-0.768508,0.238645,1.347021
2024-09-22,-0.614123,1.48614,-0.221546,-0.085369
2024-09-23,-0.022053,0.173989,0.187356,0.727575


In [13]:
df.tail()

Unnamed: 0,A,B,C,D
2024-09-24,2.159009,-0.220985,-1.49169,-1.464994
2024-09-25,1.821493,1.037353,0.706072,-0.210938
2024-09-26,0.365224,1.036458,0.226146,0.157349
2024-09-27,0.593835,-0.54305,1.264084,-0.617011
2024-09-28,-0.682238,-0.024755,-0.349666,0.807032


In [14]:
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.752218,0.177942,-0.043558,0.022961
std,1.181958,0.858547,0.756218,1.044936
min,-0.682238,-1.078239,-1.49169,-1.65442
25%,-0.170373,-0.462533,-0.361828,-0.515493
50%,0.479529,0.074617,-0.017095,0.03599
75%,1.771643,0.947597,0.23552,0.787167
max,2.498754,1.48614,1.264084,1.347021


In [15]:
df.info

<bound method DataFrame.info of                    A         B         C         D
2024-09-19  2.498754  0.681015 -0.629096 -1.654420
2024-09-20  1.622092 -1.078239 -0.365883  1.223361
2024-09-21 -0.219813 -0.768508  0.238645  1.347021
2024-09-22 -0.614123  1.486140 -0.221546 -0.085369
2024-09-23 -0.022053  0.173989  0.187356  0.727575
2024-09-24  2.159009 -0.220985 -1.491690 -1.464994
2024-09-25  1.821493  1.037353  0.706072 -0.210938
2024-09-26  0.365224  1.036458  0.226146  0.157349
2024-09-27  0.593835 -0.543050  1.264084 -0.617011
2024-09-28 -0.682238 -0.024755 -0.349666  0.807032>

In [16]:
df.T

Unnamed: 0,2024-09-19,2024-09-20,2024-09-21,2024-09-22,2024-09-23,2024-09-24,2024-09-25,2024-09-26,2024-09-27,2024-09-28
A,2.498754,1.622092,-0.219813,-0.614123,-0.022053,2.159009,1.821493,0.365224,0.593835,-0.682238
B,0.681015,-1.078239,-0.768508,1.48614,0.173989,-0.220985,1.037353,1.036458,-0.54305,-0.024755
C,-0.629096,-0.365883,0.238645,-0.221546,0.187356,-1.49169,0.706072,0.226146,1.264084,-0.349666
D,-1.65442,1.223361,1.347021,-0.085369,0.727575,-1.464994,-0.210938,0.157349,-0.617011,0.807032


In [17]:
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2024-09-19,2.498754,0.681015,-0.629096,-1.65442
2024-09-20,1.622092,-1.078239,-0.365883,1.223361
2024-09-21,-0.219813,-0.768508,0.238645,1.347021
2024-09-22,-0.614123,1.48614,-0.221546,-0.085369
2024-09-23,-0.022053,0.173989,0.187356,0.727575
2024-09-24,2.159009,-0.220985,-1.49169,-1.464994
2024-09-25,1.821493,1.037353,0.706072,-0.210938
2024-09-26,0.365224,1.036458,0.226146,0.157349
2024-09-27,0.593835,-0.54305,1.264084,-0.617011
2024-09-28,-0.682238,-0.024755,-0.349666,0.807032


In [18]:
df.sort_values(by="C")

Unnamed: 0,A,B,C,D
2024-09-24,2.159009,-0.220985,-1.49169,-1.464994
2024-09-19,2.498754,0.681015,-0.629096,-1.65442
2024-09-20,1.622092,-1.078239,-0.365883,1.223361
2024-09-28,-0.682238,-0.024755,-0.349666,0.807032
2024-09-22,-0.614123,1.48614,-0.221546,-0.085369
2024-09-23,-0.022053,0.173989,0.187356,0.727575
2024-09-26,0.365224,1.036458,0.226146,0.157349
2024-09-21,-0.219813,-0.768508,0.238645,1.347021
2024-09-25,1.821493,1.037353,0.706072,-0.210938
2024-09-27,0.593835,-0.54305,1.264084,-0.617011


In [19]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2024-09-22,-0.614123,1.48614
2024-09-23,-0.022053,0.173989


In [20]:
df[df > 0]

Unnamed: 0,A,B,C,D
2024-09-19,2.498754,0.681015,,
2024-09-20,1.622092,,,1.223361
2024-09-21,,,0.238645,1.347021
2024-09-22,,1.48614,,
2024-09-23,,0.173989,0.187356,0.727575
2024-09-24,2.159009,,,
2024-09-25,1.821493,1.037353,0.706072,
2024-09-26,0.365224,1.036458,0.226146,0.157349
2024-09-27,0.593835,,1.264084,
2024-09-28,,,,0.807032


In [21]:
df.mean()

A    0.752218
B    0.177942
C   -0.043558
D    0.022961
dtype: float64

In [22]:
df.median()

A    0.479529
B    0.074617
C   -0.017095
D    0.035990
dtype: float64

In [23]:
df.mode()

Unnamed: 0,A,B,C,D
0,-0.682238,-1.078239,-1.49169,-1.65442
1,-0.614123,-0.768508,-0.629096,-1.464994
2,-0.219813,-0.54305,-0.365883,-0.617011
3,-0.022053,-0.220985,-0.349666,-0.210938
4,0.365224,-0.024755,-0.221546,-0.085369
5,0.593835,0.173989,0.187356,0.157349
6,1.622092,0.681015,0.226146,0.727575
7,1.821493,1.036458,0.238645,0.807032
8,2.159009,1.037353,0.706072,1.223361
9,2.498754,1.48614,1.264084,1.347021


In [24]:
df.transform(lambda x: x * 101.2)

Unnamed: 0,A,B,C,D
2024-09-19,252.873892,68.918737,-63.664558,-167.427269
2024-09-20,164.155718,-109.117741,-37.02733,123.80413
2024-09-21,-22.245126,-77.773046,24.150841,136.318534
2024-09-22,-62.149281,150.397329,-22.420468,-8.63937
2024-09-23,-2.231734,17.607712,18.960379,73.630561
2024-09-24,218.491706,-22.363715,-150.959047,-148.257384
2024-09-25,184.335071,104.980122,71.454472,-21.346956
2024-09-26,36.960634,104.889547,22.885994,15.923719
2024-09-27,60.096102,-54.956611,127.925295,-62.441496
2024-09-28,-69.042514,-2.505157,-35.386186,81.671598


In [25]:
s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])

s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [26]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})

right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})

left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [27]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.682238,-0.024755,-0.349666,0.807032
-0.614123,1.48614,-0.221546,-0.085369
-0.219813,-0.768508,0.238645,1.347021
-0.022053,0.173989,0.187356,0.727575
0.365224,1.036458,0.226146,0.157349
0.593835,-0.54305,1.264084,-0.617011
1.622092,-1.078239,-0.365883,1.223361
1.821493,1.037353,0.706072,-0.210938
2.159009,-0.220985,-1.49169,-1.464994
2.498754,0.681015,-0.629096,-1.65442


In [28]:
df_data = pd.read_csv(r"C:\Users\AWAIS LAPTOP STORE\Desktop\Python\data\titanic.csv")

In [29]:
df_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [31]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [33]:
df_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [34]:
df_data.isnull().sum

<bound method DataFrame.sum of      PassengerId  Survived  Pclass   Name    Sex    Age  SibSp  Parch  Ticket  \
0          False     False   False  False  False  False  False  False   False   
1          False     False   False  False  False  False  False  False   False   
2          False     False   False  False  False  False  False  False   False   
3          False     False   False  False  False  False  False  False   False   
4          False     False   False  False  False  False  False  False   False   
..           ...       ...     ...    ...    ...    ...    ...    ...     ...   
886        False     False   False  False  False  False  False  False   False   
887        False     False   False  False  False  False  False  False   False   
888        False     False   False  False  False   True  False  False   False   
889        False     False   False  False  False  False  False  False   False   
890        False     False   False  False  False  False  False  False   False 

In [37]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoding_col = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
for col in encoding_col:
    df_data[col] = encoder.fit_transform(df_data[col])


In [38]:
df_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.2500,147,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.0,0,0,669,7.9250,147,2
3,4,1,1,272,0,35.0,1,0,49,53.1000,55,2
4,5,0,3,15,1,35.0,0,0,472,8.0500,147,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,548,1,27.0,0,0,101,13.0000,147,2
887,888,1,1,303,0,19.0,0,0,14,30.0000,30,2
888,889,0,3,413,0,,1,2,675,23.4500,147,2
889,890,1,1,81,1,26.0,0,0,8,30.0000,60,0


In [39]:
X = df_data.drop(['Age'], axis=1)
y = df_data['Age']

In [40]:
df_data.fillna(value = 2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.2500,147,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,0,26.0,0,0,669,7.9250,147,2
3,4,1,1,272,0,35.0,1,0,49,53.1000,55,2
4,5,0,3,15,1,35.0,0,0,472,8.0500,147,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,548,1,27.0,0,0,101,13.0000,147,2
887,888,1,1,303,0,19.0,0,0,14,30.0000,30,2
888,889,0,3,413,0,2.0,1,2,675,23.4500,147,2
889,890,1,1,81,1,26.0,0,0,8,30.0000,60,0


In [41]:
df_data.mean()

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Name           445.000000
Sex              0.647587
Age             29.699118
SibSp            0.523008
Parch            0.381594
Ticket         338.528620
Fare            32.204208
Cabin          130.744108
Embarked         1.538721
dtype: float64

In [47]:
df_data.groupby('Name')[['Fare', 'Sex']].mean()

Unnamed: 0_level_0,Fare,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7.5500,1.0
1,20.2500,1.0
2,20.2500,0.0
3,24.0000,1.0
4,24.0000,0.0
...,...,...
886,9.5000,1.0
887,9.5000,1.0
888,27.7208,1.0
889,14.5000,1.0


In [48]:
stacked = df_data.stack(future_stack=True)
stacked

0    PassengerId      1.00
     Survived         0.00
     Pclass           3.00
     Name           108.00
     Sex              1.00
                     ...  
890  Parch            0.00
     Ticket         466.00
     Fare             7.75
     Cabin          147.00
     Embarked         1.00
Length: 10692, dtype: float64

In [49]:
stacked.unstack(sort= 'Fare')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,108.0,1.0,22.0,1.0,0.0,523.0,7.2500,147.0,2.0
1,2.0,1.0,1.0,190.0,0.0,38.0,1.0,0.0,596.0,71.2833,81.0,0.0
2,3.0,1.0,3.0,353.0,0.0,26.0,0.0,0.0,669.0,7.9250,147.0,2.0
3,4.0,1.0,1.0,272.0,0.0,35.0,1.0,0.0,49.0,53.1000,55.0,2.0
4,5.0,0.0,3.0,15.0,1.0,35.0,0.0,0.0,472.0,8.0500,147.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887.0,0.0,2.0,548.0,1.0,27.0,0.0,0.0,101.0,13.0000,147.0,2.0
887,888.0,1.0,1.0,303.0,0.0,19.0,0.0,0.0,14.0,30.0000,30.0,2.0
888,889.0,0.0,3.0,413.0,0.0,,1.0,2.0,675.0,23.4500,147.0,2.0
889,890.0,1.0,1.0,81.0,1.0,26.0,0.0,0.0,8.0,30.0000,60.0,0.0


In [55]:
df_data["Sex"] = df_data["Age"].astype("category")
df_data["Sex"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Sex, Length: 891, dtype: category
Categories (88, float64): [0.42, 0.67, 0.75, 0.83, ..., 70.5, 71.0, 74.0, 80.0]

In [56]:
df_data.groupby("Age", observed=False).size()

Age
0.42     1
0.67     1
0.75     2
0.83     2
0.92     1
        ..
70.00    2
70.50    1
71.00    2
74.00    1
80.00    1
Length: 88, dtype: int64