In [3]:
# Detect & Remove Outliers using IQR Method

# Objective: Learn to identify and remove outliers from a dataset using the Interquartile Range (IQR) method.
# Instructions:
# For each example, perform the following steps:
#     1. Load the Dataset: Load the dataset into your environment. You can use pandas to read the CSV file.
#     2. Calculate IQR: Calculate the first quartile (Q1), third quartile (Q3), and the IQR for the specified column.
#     3. Identify Outliers: Determine which data points are considered outliers.
#     4. Remove Outliers: Remove the outliers from the dataset.
#     5. Verify: Ensure the outliers are removed by checking the size or summary statistics of the dataset before and after the removal.
    
import pandas as pd

# ---------------------------------------------------------------------------
# Configuration: values a reader might want to change for another example.
# ---------------------------------------------------------------------------
CSV_PATH = 'sales_data.csv'
SALES_COLUMN = 'Monthly_Sales'
IQR_MULTIPLIER = 1.5  # fence width used by the conventional 1.5 * IQR rule


def iqr_fences(values, k=IQR_MULTIPLIER):
    """Compute the quartiles and IQR outlier fences of a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric data to analyse.
    k : float, default 1.5
        Multiplier applied to the IQR to obtain the fences.

    Returns
    -------
    tuple
        ``(q1, q3, iqr, lower, upper)`` where ``iqr = q3 - q1``,
        ``lower = q1 - k * iqr`` and ``upper = q3 + k * iqr``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - k * iqr, q3 + k * iqr


def split_outliers(frame, column, lower, upper):
    """Partition ``frame`` into (rows to keep, outlier rows) for ``column``.

    A row is an outlier only when its value is strictly below ``lower`` or
    strictly above ``upper``. Comparisons against a missing value are False,
    so rows with a missing value in ``column`` are never flagged and stay in
    the kept frame instead of being silently dropped and counted as outliers.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to filter; it is not modified.
    column : str
        Name of the column the fences apply to.
    lower, upper : float
        Inclusive bounds of the accepted range.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(kept, outliers)``; together they contain every row of ``frame``
        exactly once, with the original index labels.
    """
    is_outlier = (frame[column] < lower) | (frame[column] > upper)
    return frame[~is_outlier], frame[is_outlier]


# Create the sample dataset (store 5's value of 40000 is far above the rest)
data = {
    'Store': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Monthly_Sales': [1500, 2000, 1800, 3000, 40000, 2500, 2200, 2100, 1900, 2300]
}

# Save to CSV so the task below can load it exactly like an external file
pd.DataFrame(data).to_csv(CSV_PATH, index=False)

print(f"CSV file '{CSV_PATH}' created successfully.")

# Task:
#     Dataset: sales_data.csv (create or obtain it yourself; it must contain
#              a Monthly_Sales column)
#     Column to analyze: Monthly_Sales
#     Steps:
#         1. Load sales_data.csv.
#         2. Calculate Q1, Q3, and IQR for Monthly_Sales.
#         3. Identify outliers.
#         4. Remove the outliers.
#         5. Check the number of rows removed.

# Step 1: Load the dataset
df = pd.read_csv(CSV_PATH)

# Display the first few rows to inspect the dataset
print("Original DataFrame:")
print(df.head())

# Step 2: Calculate Q1, Q3, and IQR for Monthly_Sales,
# together with the fences used in step 3
Q1, Q3, IQR, lower_bound, upper_bound = iqr_fences(df[SALES_COLUMN])

# Steps 3 & 4: Identify the outlier rows and separate them from the rest.
# `outliers` holds the identified rows; `df_cleaned` is the data without them.
df_cleaned, outliers = split_outliers(df, SALES_COLUMN, lower_bound, upper_bound)

# Sanity check: every original row ended up in exactly one of the two frames
assert len(df_cleaned) + len(outliers) == len(df)

# Step 5: Verify the number of rows removed
rows_removed = len(df) - len(df_cleaned)
print("\nNumber of rows removed:", rows_removed)

# Display the cleaned DataFrame
print("\nCleaned DataFrame (first 5 rows):")
print(df_cleaned.head())

# Optionally: Show summary statistics of the original and cleaned dataset
print("\nOriginal DataFrame Summary:")
print(df[SALES_COLUMN].describe())

print("\nCleaned DataFrame Summary:")
print(df_cleaned[SALES_COLUMN].describe())




CSV file 'sales_data.csv' created successfully.
Original DataFrame:
   Store  Monthly_Sales
0      1           1500
1      2           2000
2      3           1800
3      4           3000
4      5          40000

Number of rows removed: 1

Cleaned DataFrame (first 5 rows):
   Store  Monthly_Sales
0      1           1500
1      2           2000
2      3           1800
3      4           3000
5      6           2500

Original DataFrame Summary:
count       10.000000
mean      5930.000000
std      11977.947329
min       1500.000000
25%       1925.000000
50%       2150.000000
75%       2450.000000
max      40000.000000
Name: Monthly_Sales, dtype: float64

Cleaned DataFrame Summary:
count       9.000000
mean     2144.444444
std       433.333333
min      1500.000000
25%      1900.000000
50%      2100.000000
75%      2300.000000
max      3000.000000
Name: Monthly_Sales, dtype: float64
