<a href="https://colab.research.google.com/github/MekalaHarshitha2312/Internship-Projects/blob/main/Data_Analytics_Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Read the training CSV from the working directory into a DataFrame
train_data = pd.read_csv('train.csv')

#Preview the top of the table (head() with its default of five rows)
print("First few rows of the training data:")
print(train_data.head(n=5), '\n')

First few rows of the training data:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0          

In [None]:
#Count the nulls per column, then report only the columns that have at least one
missing_values = train_data.isna().sum()
print("Missing values in each column:")
print(missing_values.loc[missing_values.gt(0)], '\n')

Missing values in each column:
Age         177
Cabin       687
Embarked      2
dtype: int64 



In [None]:
#Fill missing Age with the median
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())

#Fill missing Embarked with its most frequent value (mode).
#The missing-value check reports gaps in this column, and leaving them in
#place means the "after handling" report below still lists them.
embarked_mode = train_data['Embarked'].mode()
if not embarked_mode.empty:  #mode() is empty only when the whole column is null
    train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode[0])

#Drop rows with missing Fare values (if any)
train_data = train_data.dropna(subset=['Fare'])

#NOTE(review): Cabin is still not handled here; it is not among the columns
#summarised in the cells visible in this notebook - confirm whether it should
#be dropped or imputed.

#Verify missing values after handling
print("Missing values after handling:")
print(train_data.isnull().sum(), '\n')

Missing values after handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 



In [None]:
#Function to detect outliers using IQR
def detect_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``. Both are computed from the frame that is passed in, so
    calling this again on already-trimmed data yields new, narrower fences.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Label of the column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index labels kept) whose value is
        strictly below the lower fence or strictly above the upper fence.
        Rows where the value is NaN are never selected, because both
        comparisons evaluate to False for NaN.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    #Explicit comparisons (rather than a negated between()) keep NaN rows out
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return data[is_outlier]

#Identify outliers for Age and Fare (both computed on the same, untrimmed data)
age_outliers = detect_outliers_iqr(train_data, 'Age')
fare_outliers = detect_outliers_iqr(train_data, 'Fare')

#Print the number of outliers
print(f"Number of outliers in Age: {len(age_outliers)}")
print(f"Number of outliers in Fare: {len(fare_outliers)}\n")

#Remove outliers from the dataset.
#The outlier frames keep the original row labels, so drop by index in one step
#instead of re-matching rows through float value equality with isin().
outlier_index = age_outliers.index.union(fare_outliers.index)
train_data = train_data.drop(index=outlier_index)

#Verify the shape of the cleaned dataset
print(f"Shape of dataset after removing outliers: {train_data.shape}\n")

Number of outliers in Age: 66
Number of outliers in Fare: 116

Shape of dataset after removing outliers: (721, 12)



In [None]:
#Final check: per-column null counts on the cleaned frame
print("Remaining missing values in each column:")
print(train_data.isnull().sum(), '\n')

#Re-run the IQR test on the cleaned frame. The quartiles are recomputed from
#the data passed in, so a non-zero count here reflects the new, narrower
#fences rather than rows that survived the earlier removal.
remaining_age_outliers, remaining_fare_outliers = (
    detect_outliers_iqr(train_data, col) for col in ('Age', 'Fare')
)
print(f"Remaining number of outliers in Age: {len(remaining_age_outliers)}")
print(f"Remaining number of outliers in Fare: {len(remaining_fare_outliers)}\n")

Remaining missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          623
Embarked         0
dtype: int64 

Remaining number of outliers in Age: 42
Remaining number of outliers in Fare: 24



In [None]:
#Columns for which summary statistics are calculated
columns_to_analyze = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
]

In [None]:
#Empty mapping that will hold one entry of statistics per column name
summary_statistics = dict()

In [None]:
#Compute mean, median, mode and standard deviation for every requested column
for name in columns_to_analyze:
    if name not in train_data.columns:
        continue  #columns missing from the frame are skipped without a message

    values = train_data[name]

    #Keep the results under the column name, keyed by statistic label
    summary_statistics[name] = {
        'Mean': values.mean(),
        'Median': values.median(),
        'Mode': values.mode()[0],  #mode() returns a Series; take its first entry
        'Standard Deviation': values.std(),
    }

#Print one block per column, each followed by a blank line
for name, column_stats in summary_statistics.items():
    print(f"Summary Statistics for {name}:")
    for label in ('Mean', 'Median', 'Mode', 'Standard Deviation'):
        print(f"{label}: {column_stats[label]}")
    print()

Summary Statistics for Age:
Mean: 28.094313453536756
Median: 28.0
Mode: 28.0
Standard Deviation: 10.021961072035158

Summary Statistics for Fare:
Mean: 17.389845492371705
Median: 12.275
Mode: 8.05
Standard Deviation: 13.563036171819876

Summary Statistics for SibSp:
Mean: 0.4147018030513176
Median: 0.0
Mode: 0
Standard Deviation: 0.8539159476653936

Summary Statistics for Parch:
Mean: 0.3231622746185853
Median: 0.0
Mode: 0
Standard Deviation: 0.7885492921373876

