In [9]:
import pandas as pd

In [51]:
# Load the dataset from the CSV file and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='preprocessing.csv')
print(df.head(n=10))

         Date Category  Value   Product  Sales Region
0  2023-12-28        A    NaN  Product2  168.0   West
1  2023-03-15        A   30.4  Product3  193.3   East
2  2024-01-10        A   11.8  Product2  103.8   West
3  2023-06-05        C   16.1  Product4  119.1  North
4  2023-04-15        C   32.7  Product4  118.1   East
5  2024-01-30        C   18.5  Product1  200.2  South
6  2024-01-13        B    NaN  Product2  198.8   West
7  2023-05-05        B   15.7  Product3  192.7   East
8  2023-03-10        A    NaN  Product2  235.6  North
9  2024-03-26        C    NaN  Product2  190.1   East


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

            Value       Sales
count  450.000000  450.000000
mean    22.497778  171.452222
std      7.514988   42.376087
min     10.100000  101.600000
25%     16.025000  134.475000
50%     22.800000  169.200000
75%     28.900000  206.775000
max     34.800000  250.000000


In [12]:
# Show the dtype pandas inferred for every column
column_types = df.dtypes
print(column_types)

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object


In [47]:
# Detecting missing values :-

# df.isna() method (alias: df.isnull()) :
#       returns a boolean DataFrame of the same shape, True wherever a value is missing
null_mask = df.isna()
print(null_mask)
print(null_mask.sum())          # column-wise count of missing values (each True counts as 1)

# df.notna() method (alias: df.notnull()) :
#       the inverse mask, True wherever a value is present
#       (pandas has no isnotnull() / isnotna() methods)

      Date  Category  Value  Product  Sales  Region
0    False     False   True    False  False   False
1    False     False  False    False  False   False
2    False     False  False    False  False   False
3    False     False  False    False  False   False
4    False     False  False    False  False   False
..     ...       ...    ...      ...    ...     ...
495  False     False  False    False  False   False
496  False     False   True    False  False   False
497  False     False  False    False  False   False
498  False     False  False    False  False   False
499  False     False  False    False   True   False

[500 rows x 6 columns]
Date         0
Category     0
Value       50
Product      0
Sales       50
Region       0
dtype: int64


In [52]:
# df.fillna(value) method :
#       returns a new DataFrame in which every missing entry is replaced by `value`
filled_0_df = df.fillna(0)
print(filled_0_df.isna().sum(), end="\n\n")
print(filled_0_df.head(10), end="\n\n")

# NOTE: plain assignment does NOT copy -- filled_mean_df is only a second name for df,
# so the column assignment below also fills df['Value'] (use df.copy() to get an
# independent frame). The later astype(int) cell needs df['Value'] to be free of NaN.
filled_mean_df = df
value_mean = filled_mean_df['Value'].mean()
filled_mean_df['Value'] = filled_mean_df['Value'].fillna(value_mean)    # impute missing Value entries with the column mean
print(filled_mean_df.isna().sum(), end="\n\n")
print(filled_mean_df.head(10))

Date        0
Category    0
Value       0
Product     0
Sales       0
Region      0
dtype: int64

         Date Category  Value   Product  Sales Region
0  2023-12-28        A    0.0  Product2  168.0   West
1  2023-03-15        A   30.4  Product3  193.3   East
2  2024-01-10        A   11.8  Product2  103.8   West
3  2023-06-05        C   16.1  Product4  119.1  North
4  2023-04-15        C   32.7  Product4  118.1   East
5  2024-01-30        C   18.5  Product1  200.2  South
6  2024-01-13        B    0.0  Product2  198.8   West
7  2023-05-05        B   15.7  Product3  192.7   East
8  2023-03-10        A    0.0  Product2  235.6  North
9  2024-03-26        C    0.0  Product2  190.1   East

Date         0
Category     0
Value        0
Product      0
Sales       50
Region       0
dtype: int64

         Date Category      Value   Product  Sales Region
0  2023-12-28        A  22.497778  Product2  168.0   West
1  2023-03-15        A  30.400000  Product3  193.3   East
2  2024-01-10        A  11.80

In [None]:
# Renaming row/column labels
# Syntax :- df.rename(index={oldname: newname, ...})   for row labels
#           df.rename(columns={oldname: newname, ...}) for column labels
#           (same as passing the mapping positionally with axis=0 / axis=1)

column_renames = {'Date': 'Sale_Date', 'Value': 'Price'}
renamed_col_df = df.rename(columns=column_renames)
print(renamed_col_df.head(5), end="\n\n")

row_renames = {0: 'first', 1: 'second'}
renamed_row_df = df.rename(index=row_renames)
print(renamed_row_df.head(5), end="\n\n")

print(df.head(5))           # rename() returned new frames, so df keeps its original labels


    Sale_Date Category      Price   Product  Sales Region
0  2023-12-28        A  22.497778  Product2  168.0   West
1  2023-03-15        A  30.400000  Product3  193.3   East
2  2024-01-10        A  11.800000  Product2  103.8   West
3  2023-06-05        C  16.100000  Product4  119.1  North
4  2023-04-15        C  32.700000  Product4  118.1   East

              Date Category      Value   Product  Sales Region
first   2023-12-28        A  22.497778  Product2  168.0   West
second  2023-03-15        A  30.400000  Product3  193.3   East
2       2024-01-10        A  11.800000  Product2  103.8   West
3       2023-06-05        C  16.100000  Product4  119.1  North
4       2023-04-15        C  32.700000  Product4  118.1   East

         Date Category      Value   Product  Sales Region
0  2023-12-28        A  22.497778  Product2  168.0   West
1  2023-03-15        A  30.400000  Product3  193.3   East
2  2024-01-10        A  11.800000  Product2  103.8   West
3  2023-06-05        C  16.100000  Produ

In [54]:
# Changing datatypes with the .astype(datatype) method
# Casting float -> int drops the fractional part (e.g. 30.4 becomes 30);
# the cast raises if the column still contains NaN.

value_as_int = df['Value'].astype(int)
df['Value_As_Int'] = value_as_int
print(df.head(10))

         Date Category      Value   Product  Sales Region  Hiked_Value  \
0  2023-12-28        A  22.497778  Product2  168.0   West    44.995556   
1  2023-03-15        A  30.400000  Product3  193.3   East    60.800000   
2  2024-01-10        A  11.800000  Product2  103.8   West    23.600000   
3  2023-06-05        C  16.100000  Product4  119.1  North    32.200000   
4  2023-04-15        C  32.700000  Product4  118.1   East    65.400000   
5  2024-01-30        C  18.500000  Product1  200.2  South    37.000000   
6  2024-01-13        B  22.497778  Product2  198.8   West    44.995556   
7  2023-05-05        B  15.700000  Product3  192.7   East    31.400000   
8  2023-03-10        A  22.497778  Product2  235.6  North    44.995556   
9  2024-03-26        C  22.497778  Product2  190.1   East    44.995556   

   Value_As_Int  
0            22  
1            30  
2            11  
3            16  
4            32  
5            18  
6            22  
7            15  
8            22  
9    

In [None]:
# Apply a function to every element of a column with the apply(function) method

def double(value):
    """Return twice the given value."""
    return value * 2

df['Hiked_Value'] = df['Value'].apply(double)
print(df.head(5))

         Date Category      Value   Product  Sales Region  Hiked_Value  \
0  2023-12-28        A  22.497778  Product2  168.0   West    44.995556   
1  2023-03-15        A  30.400000  Product3  193.3   East    60.800000   
2  2024-01-10        A  11.800000  Product2  103.8   West    23.600000   
3  2023-06-05        C  16.100000  Product4  119.1  North    32.200000   
4  2023-04-15        C  32.700000  Product4  118.1   East    65.400000   

   Value_As_Int  
0            22  
1            30  
2            11  
3            16  
4            32  


In [None]:
# Grouping with the groupby() method and aggregating with the agg() method
# Syntax :- df.groupby(list_of_dimensions)[list_of_measures].agg(list_of_aggregate_functions)

# Note :- agg() also accepts user-defined functions

df = pd.read_csv('preprocessing.csv')
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)     # fill missing numeric values with their column mean
print(df.head(5))

dimensions = ['Product', 'Region']
measures = ['Value', 'Sales']
grouped_stats = df.groupby(dimensions)[measures].agg(['sum', 'mean'])
print(grouped_stats)

         Date Category      Value   Product  Sales Region
0  2023-12-28        A  22.497778  Product2  168.0   West
1  2023-03-15        A  30.400000  Product3  193.3   East
2  2024-01-10        A  11.800000  Product2  103.8   West
3  2023-06-05        C  16.100000  Product4  119.1  North
4  2023-04-15        C  32.700000  Product4  118.1   East
                      Value                   Sales            
                        sum       mean          sum        mean
Product  Region                                                
Product1 East    744.495556  24.015986  5482.613333  176.858495
         North   769.091111  21.974032  6430.004444  183.714413
         South   535.191111  24.326869  3586.800000  163.036364
         West    460.497778  21.928466  3602.204444  171.533545
Product2 East    632.188889  23.414403  4944.552222  183.131564
         North   583.891111  22.457350  4690.756667  180.413718
         South   580.495556  22.326752  4318.308889  166.088803
         Wes

In [74]:
# Joining DataFrames with pd.merge(left, right, on=..., how=...)
# Two small frames sharing a 'key' column; keys 3 and 4 appear in both.

df1 = pd.DataFrame(dict(
    key=[1, 2, 3, 4],
    val1=[10, 20, 30, 40],
))

df2 = pd.DataFrame(dict(
    key=[3, 4, 5, 6],
    val2=[9, 16, 25, 36],
))

# Show both frames separated by a blank line
print(df1, df2, sep="\n\n")

   key  val1
0    1    10
1    2    20
2    3    30
3    4    40

   key  val2
0    3     9
1    4    16
2    5    25
3    6    36


In [78]:
# Key-based joins on "key"; each result is printed followed by a blank line.
#   inner -> only keys present in both frames
#   outer -> all keys from either frame (full outer join)
#   left  -> all keys from df1 (left outer join)
#   right -> all keys from df2 (right outer join)
for join_type in ("inner", "outer", "left", "right"):
    print(df1.merge(df2, on="key", how=join_type), end="\n\n")

# cross join: every row of df1 paired with every row of df2, so no `on` key is given
print(df1.merge(df2, how="cross"), end="\n\n")

   key  val1  val2
0    3    30     9
1    4    40    16

   key  val1  val2
0    1  10.0   NaN
1    2  20.0   NaN
2    3  30.0   9.0
3    4  40.0  16.0
4    5   NaN  25.0
5    6   NaN  36.0

   key  val1  val2
0    1    10   NaN
1    2    20   NaN
2    3    30   9.0
3    4    40  16.0

   key  val1  val2
0    3  30.0     9
1    4  40.0    16
2    5   NaN    25
3    6   NaN    36

    key_x  val1  key_y  val2
0       1    10      3     9
1       1    10      4    16
2       1    10      5    25
3       1    10      6    36
4       2    20      3     9
5       2    20      4    16
6       2    20      5    25
7       2    20      6    36
8       3    30      3     9
9       3    30      4    16
10      3    30      5    25
11      3    30      6    36
12      4    40      3     9
13      4    40      4    16
14      4    40      5    25
15      4    40      6    36

