In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Read both raw inputs from the notebook's working directory:
# the Space-Track catalogue (CSV) and the UCS satellite database (Excel).
space_track_data = pd.read_csv(filepath_or_buffer='space_track_final_data.csv')
ucs_satellite = pd.read_excel(io='UCS-Satellite-Database.xlsx')

In [None]:
# Columns whose dtypes we want to inspect in the Space-Track frame
columns_to_check = ['INTLDES', 'NORAD_CAT_ID', 'OBJECT_TYPE', 'SATNAME', 'COUNTRY', 'LAUNCH']

# Collect the dtype of every requested column that is actually present;
# absent columns are skipped rather than raising a KeyError.
column_types = {}
for column in columns_to_check:
    if column in space_track_data.columns:
        column_types[column] = space_track_data[column].dtype

print(column_types)

In [None]:
# Call describe() so the cell shows the summary table; the bare attribute only
# displays the bound-method repr.
space_track_data.describe()

In [None]:
# Keep a handle on the original column index before trimming.
ucs_col_names = ucs_satellite.columns

# Remove every column whose label matches 'Unnamed:'.
unnamed_columns = list(ucs_satellite.filter(regex='Unnamed:'))
columns_to_keep = ucs_satellite.columns.drop(unnamed_columns)
ucs_satellite = ucs_satellite[columns_to_keep]
ucs_satellite.head()

In [None]:
# Columns whose dtypes we want to inspect in the UCS frame
columns_to_check = ['NORAD Number', 'Current Official Name of Satellite', 'Country of Operator/Owner', 'Users', 'Class of Orbit']

# Collect the dtype of every requested column that is actually present;
# absent columns are skipped rather than raising a KeyError.
column_types = {}
for column in columns_to_check:
    if column in ucs_satellite.columns:
        column_types[column] = ucs_satellite[column].dtype

print(column_types)

In [None]:
def summary_stats(df):
    """Return descriptive statistics and missing-value counts for ``df``.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A 2-tuple ``(summary, missing_vals)`` where ``summary`` is the result
        of ``df.describe()`` and ``missing_vals`` is a Series holding the
        number of missing entries in each column.
    """
    return df.describe(), df.isna().sum()

In [None]:
# Descriptive statistics and per-column missing counts for Space-Track,
# followed by the row count as the cell's displayed value.
summary_space_track, missing_space_track = summary_stats(space_track_data)
print(summary_space_track, missing_space_track, sep="\n")
len(space_track_data)

In [None]:
# Descriptive statistics and per-column missing counts for the UCS data,
# followed by the row count as the cell's displayed value.
summary_ucs, missing_ucs = summary_stats(ucs_satellite)
print(summary_ucs, missing_ucs, sep="\n")
len(ucs_satellite)

## Data Preprocessing

In [None]:
def preprocessing(df, c_drop, c_fill, c_char, c_dt, impute='mean'):
    """Drop, impute, and re-type columns of ``df``, returning a new DataFrame.

    The steps run in this order: drop columns, fill missing values, convert
    date columns, convert categorical columns.

    Args:
        df: Input DataFrame. It is not modified; the work is done on the frame
            returned by ``DataFrame.drop``.
        c_drop: Column labels to remove. Labels that are not present are
            ignored.
        c_fill: Column labels whose missing values are filled with the
            column's mean or median (see ``impute``).
        c_char: Column labels to convert to the ``category`` dtype.
        c_dt: Column labels to parse with ``pd.to_datetime``; values that
            cannot be parsed become ``NaT``.
        impute: ``'mean'`` (default) or ``'median'``. Any other value leaves
            the ``c_fill`` columns untouched.

    Returns:
        The processed DataFrame.

    Raises:
        KeyError: If a label in ``c_fill`` (when imputing), ``c_dt`` or
            ``c_char`` is not a column of the frame after dropping.
    """
    # Dropping columns (returns a new frame, so the caller's object is untouched)
    df = df.drop(columns=c_drop, errors='ignore')

    # Filling missing values.
    # The result is assigned back to the column instead of calling
    # ``df[c].fillna(..., inplace=True)``: that chained in-place form operates on
    # a temporary Series and is not guaranteed to update ``df`` (it warns in
    # pandas 2.x and has no effect under Copy-on-Write).
    for c in c_fill:
        if impute == 'mean':
            fill_value = df[c].mean()
        elif impute == 'median':
            fill_value = df[c].median()
        else:
            # Unrecognised strategy: leave the column as it is.
            continue
        df[c] = df[c].fillna(fill_value)

    # Converting data to date and time format (unparseable entries -> NaT)
    for t in c_dt:
        df[t] = pd.to_datetime(df[t], errors='coerce')

    # Converting data to category
    for char in c_char:
        df[char] = df[char].astype('category')

    return df

In [None]:
# --- Space-Track: column roles for preprocessing ---
space_track_cols_to_drop = ['COMMENT', 'COMMENTCODE', 'RCS_SIZE']
space_track_cols_to_impute = ['PERIOD', 'INCLINATION', 'APOGEE', 'PERIGEE']
space_track_date_columns = ['LAUNCH', 'DECAY']
space_track_categorical_columns = ['OBJECT_TYPE', 'COUNTRY', 'SITE']

space_track_data = preprocessing(
    space_track_data,
    space_track_cols_to_drop,
    space_track_cols_to_impute,
    space_track_categorical_columns,
    space_track_date_columns,
)

# --- UCS: column roles for preprocessing ---
ucs_cols_to_drop = ['Detailed Purpose', 'Type of Orbit', 'Dry Mass (kg.)', 'Power (watts)', 'Comments'] + ['Source', 'Source.1', 'Source.2', 'Source.3', 'Source.4', 'Source.5', 'Source.6']
ucs_cols_to_impute = ['Expected Lifetime (yrs.)', 'Longitude of GEO (degrees)', 'Period (minutes)']
ucs_date_columns = ['Date of Launch']
ucs_categorical_columns = ['Class of Orbit', 'Purpose', 'Users']

ucs_satellite = preprocessing(
    ucs_satellite,
    ucs_cols_to_drop,
    ucs_cols_to_impute,
    ucs_categorical_columns,
    ucs_date_columns,
)

# Remove rows whose inclination cell holds the string 'USA' (presumably a
# misaligned record - verify against the spreadsheet). The explicit .copy()
# makes the filtered frame independent, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
ucs_satellite = ucs_satellite[ucs_satellite['Inclination (degrees)'] != 'USA'].copy()

# NOTE(review): casting to int64 discards any fractional degrees and fails if
# missing values remain in the column - confirm that this is intended.
ucs_satellite['Inclination (degrees)'] = ucs_satellite['Inclination (degrees)'].astype('int64')

In [None]:
def quality_and_integrity_check(df, dataset_name):
    """Print a data-quality report for ``df`` to standard output.

    The report contains, in order: the total number of null cells, the dtype
    of each column, the number of fully duplicated rows, ``df.describe()``,
    and ``df.head()``.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label inserted into the report's header line.

    Returns:
        None.
    """
    print(f"--- Quality and Integrity Check for {dataset_name} Dataset ---")

    # Null cells, summed over every column
    print(f"Total null values: {df.isnull().sum().sum()}")

    # Column dtypes
    print("\nData Types:", df.dtypes, sep="\n")

    # Rows that exactly repeat an earlier row
    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

    # Each heading is printed before its table is computed.
    report_sections = (
        ("\nBasic Statistics for Numerical Columns:", df.describe),
        ("\nFirst few rows of the dataset:", df.head),
    )
    for heading, build_table in report_sections:
        print(heading)
        print(build_table())

    print("\n------------------------------------------------\n")

# Perform checks on each dataset
quality_and_integrity_check(space_track_data, "Space-Track")

# No cell visible above this point creates `reentry_history`, so calling it
# unconditionally raises NameError on a fresh kernel. Check it only when it has
# been loaded.
if 'reentry_history' in globals():
    quality_and_integrity_check(reentry_history, "Re-entry History")

quality_and_integrity_check(ucs_satellite, "UCS Satellite")


## EDA

In [None]:
# Defining a function to perform Exploratory Data Analysis

def eda(df, name):
    """Print summary tables and draw a standard set of exploratory plots.

    Steps, in order: descriptive statistics; pair plot and correlation heatmap
    of the float64/int64 columns; histograms; one box plot per float64/int64
    column; one bar plot per category/object column; one value-count line plot
    per datetime column; missing-value counts; skewness and kurtosis of the
    float64/int64 columns; value counts of each category/object column.

    Steps that need float64/int64 data are skipped when the frame has no such
    columns (or no rows).

    Args:
        df: DataFrame to explore.
        name: Label used in the printed headings and figure titles.

    Returns:
        None. Output is printed and figures are shown with ``plt.show()``.
    """
    print(f"Performing EDA on {name}")

    # Summary Stats
    print("\n Descriptive Stats")
    print(df.describe())

    # Column selections are computed once and reused by the sections below.
    numeric_df = df.select_dtypes(include=['float64', 'int64'])
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns
    has_numeric = not numeric_df.empty

    if has_numeric:
        # Pair plot for all the numeric columns.
        # pairplot builds a grid of axes, so the heading goes on the figure
        # (suptitle); plt.title would label only the last subplot drawn.
        sns.pairplot(numeric_df)
        plt.suptitle("Pair plot for Numeric Data", y=1.02)
        plt.show()

        # Correlation Heatmap for numerical features only
        plt.figure(figsize=(10, 8))
        sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f")
        plt.title(f'Correlation Heatmap of {name}')
        plt.show()

    # Histograms for all numerical (and datetime) features.
    # layout=(-1, 3) lets pandas choose the number of rows, so frames with more
    # than 15 plottable columns no longer fail on a fixed 5x3 grid.
    hist_df = df.select_dtypes(
        include=['number', 'datetime', 'datetimetz'], exclude=['timedelta']
    )
    if not hist_df.empty:
        hist_df.hist(bins=15, figsize=(15, 10), layout=(-1, 3))
        plt.suptitle(f'Histograms of Numerical Features in {name}')
        plt.show()

    # Boxplots for all numerical features
    for column in numeric_df.columns:
        plt.figure(figsize=(7, 4))
        sns.boxplot(x=df[column])
        plt.title(f'Boxplot of {column} in {name}')
        plt.show()

    # Bar plots for categorical features
    for column in categorical_columns:
        plt.figure(figsize=(7, 4))
        df[column].value_counts().plot(kind='bar')
        plt.title(f'Bar Plot of {column} in {name}')
        plt.show()

    # Time Series Analysis: number of rows per timestamp, in chronological order
    time_columns = df.select_dtypes(include=['datetime']).columns
    for col in time_columns:
        plt.figure(figsize=(10, 4))
        df[col].value_counts().sort_index().plot(kind='line')
        plt.title(f'Time Series Plot of {col} in {name}')
        plt.show()

    print("\nMissing Values Analysis:")
    print(df.isnull().sum())

    print("\nSkewness and Kurtosis:")
    if has_numeric:
        print(numeric_df.agg(['skew', 'kurtosis']).transpose())
    else:
        print("No float64/int64 columns to summarise.")

    for column in categorical_columns:
        print(f'\nFrequency Distribution for {column}:')
        print(df[column].value_counts())


In [None]:
# Run EDA on a 0.1% random sample of Space-Track. A fixed random_state makes the
# sample (and every plot derived from it) reproducible across kernel restarts.
space_track_sample = space_track_data.sample(frac=0.001, random_state=42)
df_name = 'Space Track'

# A bare len(...) in the middle of a cell is evaluated and discarded, so the
# sample size is printed explicitly.
print(f"Rows in sample: {len(space_track_sample)}")
eda(space_track_sample, df_name)

In [None]:
# Normalise inconsistent labels in the 'Class of Orbit' and 'Users' columns.
orbit_class_fixes = {'LEo': 'LEO'}  # casing typo

users_fixes = {
    # Trailing whitespace
    'Commercial ': 'Commercial',
    'Government ': 'Government',
    'Military ': 'Military',
    # Same pair of users written in a different order
    'Civil/Commercial': 'Commercial/Civil',
    'Commercial/Government': 'Government/Commercial',
    'Commercial/Military': 'Military/Commercial',
    'Civil/Military': 'Military/Civil',
    'Civil/Government': 'Government/Civil',
    'Government/Military': 'Military/Government',
}

# Series.replace on a categorical column is deprecated when it changes the set
# of categories (pandas 2.2), so relabel on plain objects and then convert to
# 'category'; the resulting categories contain only the cleaned labels.
# No replacement target is also a replacement source, so one mapping per column
# gives the same values as applying the replacements one after another.
ucs_satellite = ucs_satellite.assign(**{
    'Class of Orbit': (
        ucs_satellite['Class of Orbit']
        .astype('object')
        .replace(orbit_class_fixes)
        .astype('category')
    ),
    'Users': (
        ucs_satellite['Users']
        .astype('object')
        .replace(users_fixes)
        .astype('category')
    ),
})

In [None]:
# Run EDA on a 1% random sample of the UCS data. A fixed random_state makes the
# sample (and every plot derived from it) reproducible across kernel restarts.
ucs_sample = ucs_satellite.sample(frac=0.01, random_state=42)
ucs_name = 'UCS Satellite'

# A bare len(...) in the middle of a cell is evaluated and discarded, so the
# sample size is printed explicitly.
print(f"Rows in sample: {len(ucs_sample)}")
eda(ucs_sample, ucs_name)