# Loading the dataset into a Pandas DataFrame.

In [1]:
import pandas as pd

# Location of the raw CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run where this exact file exists. Consider a path relative to a data directory.
file_path = "C:/Users/patel/Downloads/Weekly_Assignment.csv"

# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_csv(file_path)

# `data` was bound to the loaded frame by the original cell; keep the name available.
data = df

print(df)

    User_ID   Age  Gender     Location  Followers  Following   Posts   Likes  \
0         1  25.0    Male     New York      500.0      300.0   100.0   250.0   
1         2  32.0  Female  Los Angeles     1000.0      700.0   150.0   500.0   
2         3   NaN    Male       London        NaN      500.0     NaN   150.0   
3         4  42.0  Female        Paris     2500.0     1000.0   300.0   750.0   
4         5   NaN   Other          NaN        NaN        NaN     NaN  1000.0   
5         6  29.0    Male       Sydney     1500.0        NaN   200.0   400.0   
6         7  37.0     NaN        Tokyo      800.0      600.0     NaN     NaN   
7         8  28.0  Female       Berlin        NaN        NaN   100.0   300.0   
8         9  35.0   Other       Moscow     4000.0     1200.0   400.0   900.0   
9        10   NaN    Male      Chicago        NaN        NaN     NaN   600.0   
10       11  65.0  Female  Los Angeles    10000.0     5000.0  1000.0  1500.0   
11       12  22.0    Male     New York  

# Checking missing-value counts and ratios

In [2]:
# Boolean mask of missing cells, computed once and reused for both summaries.
null_mask = df.isna()

# Per-column number of missing entries.
missing_values = null_mask.sum()
print("Missing Values:")
print(missing_values)

# Per-column fraction of missing entries (the mean of a boolean mask).
missing_ratio = null_mask.mean()
print("Missing Value Ratio:")
print(missing_ratio)

Missing Values:
User_ID          0
Age              3
Gender           1
Location         1
Followers        4
Following        4
Posts            4
Likes            1
Comments         4
Shares           2
Small_Numbers    2
dtype: int64
Missing Value Ratio:
User_ID          0.000000
Age              0.250000
Gender           0.083333
Location         0.083333
Followers        0.333333
Following        0.333333
Posts            0.333333
Likes            0.083333
Comments         0.333333
Shares           0.166667
Small_Numbers    0.166667
dtype: float64


# Handling Missing Values

In [3]:
# Columns imputed with their mean, and columns imputed with their most frequent value.
numeric_columns = ['Age', 'Followers', 'Following', 'Posts', 'Likes',
                   'Comments', 'Shares', 'Small_Numbers']
categorical_columns = ['Gender', 'Location']

# Assign the filled column back to df explicitly. Calling
# df[col].fillna(..., inplace=True) operates on an intermediate Series: pandas 2.x
# warns about it, and with Copy-on-Write enabled df would be left unchanged.
for column in numeric_columns:
    df[column] = df[column].fillna(df[column].mean())

for column in categorical_columns:
    # mode() returns the most frequent value(s); take the first one.
    df[column] = df[column].fillna(df[column].mode()[0])

print(df)

    User_ID   Age  Gender     Location  Followers  Following    Posts   Likes  \
0         1  25.0    Male     New York      500.0      300.0   100.00   250.0   
1         2  32.0  Female  Los Angeles     1000.0      700.0   150.00   500.0   
2         3  35.0    Male       London     2637.5      500.0   343.75   150.0   
3         4  42.0  Female        Paris     2500.0     1000.0   300.00   750.0   
4         5  35.0   Other  Los Angeles     2637.5     1275.0   343.75  1000.0   
5         6  29.0    Male       Sydney     1500.0     1275.0   200.00   400.0   
6         7  37.0    Male        Tokyo      800.0      600.0   343.75   650.0   
7         8  28.0  Female       Berlin     2637.5     1275.0   100.00   300.0   
8         9  35.0   Other       Moscow     4000.0     1200.0   400.00   900.0   
9        10  35.0    Male      Chicago     2637.5     1275.0   343.75   600.0   
10       11  65.0  Female  Los Angeles    10000.0     5000.0  1000.00  1500.0   
11       12  22.0    Male   

# Identifying and Removing Outliers

### Using Modified Z-score

In [4]:
import pandas as pd

import numpy as np

from scipy.stats import mstats

columns = ['Age', 'Followers','Following','Posts','Likes','Comments','Shares','Small_Numbers']

# Cut-off applied to the absolute modified Z-score: values scoring above it are
# reported as outliers. This cell only reports them; it does not change df.
threshold = 3

for column in columns:

    # Use a dedicated name for the column under inspection instead of reusing `data`.
    values = df[column]

    median = np.median(values)

    # Median absolute deviation (MAD): median distance of the values from their median.
    mad = np.median(np.abs(values - median))

    print("Outliers for " + column + ':')

    # When more than half of the values equal the median the MAD is zero and the
    # modified Z-score is undefined (division by zero would flag every other value).
    if mad == 0:
        print("(MAD is zero; modified Z-score is undefined for this column)")
        continue

    # 0.6745 is approximately the 0.75 quantile of the standard normal distribution;
    # scaling by it makes the MAD-based score comparable to an ordinary Z-score
    # when the data are normally distributed.
    modified_z_scores = 0.6745 * (values - median) / mad

    # np.where returns positional indices of the flagged values.
    outliers = np.where(np.abs(modified_z_scores) > threshold)

    for index in outliers[0]:
        # Positional lookup: correct even when df does not have a default 0..n-1 index.
        print(values.iloc[index])


Outliers for Age:
65.0
Outliers for Followers:
10000.0
Outliers for Following:
5000.0
Outliers for Posts:
1000.0
Outliers for Likes:
Outliers for Comments:
300.0
Outliers for Shares:
500.0
100.0
Outliers for Small_Numbers:
20000.0
3000.003100000001
3000.003100000001
10000.0


### Using the winsorization method

In [5]:
# Apply winsorization: clip the lowest and highest 10% of values in each listed column
columns_with_outliers = ['Age', 'Followers','Following','Posts','Comments','Shares']

for column in columns_with_outliers:
    # Winsorize with a 10% limit on each tail, cast the result to float,
    # and write it back over the original column.
    df[column] = mstats.winsorize(df[column], limits=[0.10, 0.10]).astype(float)

### Using the log-transformation method

In [6]:

# Replace Small_Numbers with log(1 + x) of its current values.
# Note: running this cell again applies the transformation a second time.
df['Small_Numbers'] = np.log1p(df['Small_Numbers'])

# Saving the pre-processed data to Excel

In [7]:
# Show the final pre-processed frame.
print(df)

# Write the frame to an Excel workbook, leaving out the row index.
df.to_excel(excel_writer='Final_Workbook.xlsx', index=False)

    User_ID   Age  Gender     Location  Followers  Following   Posts   Likes  \
0         1  25.0    Male     New York      800.0      500.0  100.00   250.0   
1         2  32.0  Female  Los Angeles     1000.0      700.0  150.00   500.0   
2         3  35.0    Male       London     2637.5      500.0  343.75   150.0   
3         4  42.0  Female        Paris     2500.0     1000.0  300.00   750.0   
4         5  35.0   Other  Los Angeles     2637.5     1275.0  343.75  1000.0   
5         6  29.0    Male       Sydney     1500.0     1275.0  200.00   400.0   
6         7  37.0    Male        Tokyo      800.0      600.0  343.75   650.0   
7         8  28.0  Female       Berlin     2637.5     1275.0  100.00   300.0   
8         9  35.0   Other       Moscow     4000.0     1200.0  400.00   900.0   
9        10  35.0    Male      Chicago     2637.5     1275.0  343.75   600.0   
10       11  42.0  Female  Los Angeles     4000.0     1275.0  500.00  1500.0   
11       12  25.0    Male     New York  