In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load sample data
data = {'Name': ['Alice', 'Bob', None, 'David' ,'Jhon' , 'Leo'],
        'Age': [24, 27, 22, None , '31'  , None],
        'Salary': [50000, 60000, 55000, 45000 , None , 75000]}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Salary
0,Alice,24.0,50000.0
1,Bob,27.0,60000.0
2,,22.0,55000.0
3,David,,45000.0
4,Jhon,31.0,


In [3]:
# Drop rows with missing values
df_clean = df.dropna()

# Calculate average salary
avg_salary = df_clean['Salary'].mean()

print("Cleaned Data:")
print(f"{df_clean} \n ")
print(f"Average Salary: {avg_salary}")

Cleaned Data:
    Name Age   Salary
0  Alice  24  50000.0
1    Bob  27  60000.0 
 
Average Salary: 55000.0


**File Handling**

In [4]:
# Save DataFrame to a CSV file
df_clean.to_csv('clean_data.csv' , index = False)

In [5]:
# Load the saved CSV file
load_df = pd.read_csv('clean_data.csv')
print("Loaded Data:")
print(load_df)

Loaded Data:
    Name  Age   Salary
0  Alice   24  50000.0
1    Bob   27  60000.0


**Data Aggregation and Grouping**

In [6]:
data = {'Category': ['A', 'B', 'A', 'B', 'C' , 'C'],
        'Values': [10, 20, 30, 40, 50 , None]}

df = pd.DataFrame(data)
df

Unnamed: 0,Category,Values
0,A,10.0
1,B,20.0
2,A,30.0
3,B,40.0
4,C,50.0
5,C,


In [7]:
# Group by Category and sum Values
df_group = df.groupby('Category')
g_sum = df_group.sum()
g_mean = df_group.mean()
g_count = df_group.count()
g_size= df_group.size()


print("Grouped Data Sum:")
print(g_sum)
print("Grouped Data Mean:")
print(g_mean)
print("Grouped Data Count:")
print(g_count)
print("Grouped Data Size:")
print(g_size)

Grouped Data Sum:
          Values
Category        
A           40.0
B           60.0
C           50.0
Grouped Data Mean:
          Values
Category        
A           20.0
B           30.0
C           50.0
Grouped Data Count:
          Values
Category        
A              2
B              2
C              1
Grouped Data Size:
Category
A    2
B    2
C    2
dtype: int64


**Exception Handling**

In [8]:
#read a non-existing file and handle the Erroe 
try:
    df = pd.read_csv('no_file')
    print("file found")
except FileNotFoundError as e:
    print(f'There is an error: {e}')

There is an error: [Errno 2] No such file or directory: 'no_file'


**Working with Dates**<br>
Example: Convert a column to datetime and filter by date range

In [9]:
data = {'Date': ['2024-01-01', '2024-02-15', '2024-03-10'],
        'Values': [10, 20, 30]}
df_date = pd.DataFrame(data)
df_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    3 non-null      object
 1   Values  3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [10]:
df_date['Date'] = pd.to_datetime(df_date['Date'])
df_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3 non-null      datetime64[ns]
 1   Values  3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 176.0 bytes
