In [1]:
import datetime
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location of the raw charter dataset. The default is the original author's
# local copy; set the CHARTER_DATA_PATH environment variable to point at the
# file on another machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'CHARTER_DATA_PATH',
    '/Users/ladipo/Desktop/Charter/charter_pricepred/data/data/data_generation.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

KeyboardInterrupt: 

In [None]:
# Select all rows of the four leading columns, addressed by position.
leading_positions = list(range(4))
df.iloc[:, leading_positions]

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Normalise the column *names* (the cell values are untouched): lower-case them
# and swap spaces for underscores so columns are easy to reference later.
# regex=False makes the literal-space replacement explicit instead of relying
# on the default pattern handling of str.replace.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
df.columns

In [None]:
# How many rows fall under each vessel type.
df.loc[:, 'vessel_type'].value_counts()

In [None]:
# Fill missing values with 0 in numeric columns only. A blanket df.fillna(0)
# would also write the integer 0 into text and date columns, producing a
# spurious "0" category and a 0 that the later pd.to_datetime call cannot read
# as a real charter date. Non-numeric gaps are therefore left as NaN.
# No rows or columns are dropped.
numeric_cols = df.select_dtypes(include='number').columns
df = df.fillna({name: 0 for name in numeric_cols})
df.head()

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Summary statistics from DataFrame.describe() with its default arguments.
df.describe()

In [None]:
# Frame-level sanity checks. None of them depends on a particular column, so
# they are reported once rather than being repeated for every column.
print(f"Duplicate rows: {df.duplicated().sum()}")
print(f"Shape of DataFrame: {df.shape}")
print("Null values per column:")
print(df.isnull().sum())

In [None]:
# Per-column report: repeated values and missing values in that column,
# followed by a separator line. The frame shape is echoed in every section.
for col in df.columns:
    report_lines = [
        f"Column: {col}",
        f"Number of duplicates in column '{col}': {df[col].duplicated().sum()}",
        f"Shape of DataFrame: {df.shape}",
        f"Number of null values in column '{col}': {df[col].isnull().sum()}",
        "-" * 40,
    ]
    print("\n".join(report_lines))


### Exploratory Data Analysis 

### Vessel Type, Category and Cargo Type

In [None]:
cat_var = ['vessel_type', 'size_category', 'cargo_type']

# One bar chart per categorical column showing how often each level occurs.
for col in cat_var:
    fig, ax = plt.subplots(figsize=(13, 5))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Count Plot of {col}')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Descriptive statistics of the daily charter price within each level of the
# three categorical columns, printed one table after another.
price_column = 'charter_price_($/day)'
for col in ('vessel_type', 'size_category', 'cargo_type'):
    group_summary = df.groupby(col)[price_column].describe()
    print(group_summary)

In [None]:
# Spread of the daily charter price for each vessel type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='vessel_type', y='charter_price_($/day)', data=df, ax=ax)
ax.set_title('Charter by vessel type')
plt.show()

### Date Analysis and Time Series plot

In [None]:
# Parse the charter date once, store it back as datetime, and derive the
# year / month / day-of-month columns from the parsed values.
parsed_dates = pd.to_datetime(df['charter_date'])
date_parts = parsed_dates.dt

df['charter_date'] = parsed_dates
df['year'], df['month'], df['day'] = date_parts.year, date_parts.month, date_parts.day


In [None]:
# Distinct vessel types, in order of first appearance.
pd.unique(df['vessel_type'])

In [None]:
# Preview the first five rows, now including the derived date columns.
df.head(n=5)

In [None]:
# Mean daily charter price per calendar month, in chronological order.
# Grouping on the month number alone (1-12) pools the same month from different
# years into a single point, which is a seasonal profile rather than
# "price over time"; grouping on the actual date keeps each year separate.
# Requires charter_date to be datetime already (converted in an earlier cell).
price_col = 'charter_price_($/day)'
monthly_price = (
    df.groupby(pd.Grouper(key='charter_date', freq='MS'))[price_col]
    .mean()
    .dropna()  # months without any charter would otherwise break the line
)

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(monthly_price.index, monthly_price.values)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.set_title('Price over Time')
ax.set_xlabel('Charter month')
ax.set_ylabel('Mean charter price ($/day)')
fig.autofmt_xdate()
plt.show()

In [None]:
# Distinct vessel types again (same check as an earlier cell).
df.loc[:, 'vessel_type'].unique()

### Capacity and Dimensions (Vessel Length in m) Analysis

In [None]:
# For cargo capacity, container capacity and vessel length: a histogram, a box
# plot and a scatter plot against the charter price. Rows where the column is
# not strictly positive are excluded before plotting.
numerical_cols_1 = ['cargo_capacity_(dwt)', 'container_capacity_(teu)', 'vessel_length_(m)']
for col in numerical_cols_1:
    filtered = df.loc[df[col] > 0]

    # Distribution with a KDE overlay.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(filtered[col], bins=20, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

    # Box plot with outlier markers hidden.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=filtered[col], showfliers=False, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.show()

    # Relationship between the column and the daily charter price.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=filtered, x=col, y='charter_price_($/day)', ax=ax)
    ax.set_title(f'{col} vs Charter Price')
    ax.set_xlabel(col)
    ax.set_ylabel('Charter Price ($/day)')
    plt.show()


In [None]:
# Groups of numeric column names used by the analysis cells.

# Capacity and dimension columns.
numerical_cols_1 = [
    'cargo_capacity_(dwt)',
    'container_capacity_(teu)',
    'vessel_length_(m)',
]
# Price, duration and fuel-cost columns.
numerical_cols_2 = [
    'charter_price_($/day)',
    'duration_(days)',
    'fuel_cost_($/liter)',
]
# LNG and LPG capacity columns.
numerical_cols_3 = [
    'lng_capacity_(m)',
    'lpg_capacity_(m)',
]

### Charter Price, Duration, Fuel Cost Analysis

In [None]:
# Histogram and box plot for charter price, charter duration and fuel cost,
# drawn on the full frame (no filtering).
numerical_cols_2 = ['charter_price_($/day)', 'duration_(days)', 'fuel_cost_($/liter)']
for col in numerical_cols_2:
    # Distribution with a KDE overlay.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(df[col], bins=20, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

    # Box plot with outlier markers hidden.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df[col], showfliers=False, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.show()

# Disabled correlation heatmap, kept for reference.
# NOTE(review): on pandas >= 2.0, df.corr() raises for non-numeric columns
# unless numeric_only=True is passed -- presumably why this was switched off;
# confirm before re-enabling.
# # Correlation Analysis
# plt.figure(figsize=(12, 8))
# sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
# plt.title('Correlation Heatmap')
# plt.show()


### LNG & LPG Capacity Analysis 

In [None]:
# Histogram, box plot and price scatter plot for LNG capacity.
numerical_cols_3 = ['lng_capacity_(m)']

# Keep only rows with a strictly positive LNG capacity.
lng_df = df[df['lng_capacity_(m)'] > 0]

for col in numerical_cols_3:
    # Every plot indexes by the loop variable, so the data drawn always matches
    # the title and axis labels (previously the column name was hard-coded and
    # `col` only fed the labels).

    # Distribution with a KDE overlay.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(lng_df[col], bins=20, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

    # Box plot with outlier markers hidden.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=lng_df[col], showfliers=False, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.show()

    # Relationship with the daily charter price; columns are passed by name
    # because the frame is already supplied through `data`.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=lng_df, x=col, y='charter_price_($/day)', ax=ax)
    ax.set_title(f'{col} vs Charter Price')
    ax.set_xlabel(col)
    ax.set_ylabel('Charter Price ($/day)')
    plt.show()


In [None]:
# Histogram, box plot and price scatter plot for LPG capacity.
numerical_cols_4 = ['lpg_capacity_(m)']

# Keep only rows with a strictly positive LPG capacity.
lpg_df = df[df['lpg_capacity_(m)'] > 0]

for col in numerical_cols_4:
    # Every plot indexes by the loop variable, so the data drawn always matches
    # the title and axis labels (previously the column name was hard-coded and
    # `col` only fed the labels).

    # Distribution with a KDE overlay.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(lpg_df[col], bins=20, kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

    # Box plot with outlier markers hidden.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=lpg_df[col], showfliers=False, ax=ax)
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.show()

    # Relationship with the daily charter price; columns are passed by name
    # because the frame is already supplied through `data`.
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.scatterplot(data=lpg_df, x=col, y='charter_price_($/day)', ax=ax)
    ax.set_title(f'{col} vs Charter Price')
    ax.set_xlabel(col)
    ax.set_ylabel('Charter Price ($/day)')
    plt.show()


### Size Category and Cargo Type Analysis

In [None]:
# Frequency of each size category and cargo type.
categorical_cols = ['size_category', 'cargo_type']
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'Frequency of Each {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.xticks(rotation=270)
    plt.show()

# Daily charter price split by the same two columns (outlier markers hidden).
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=col, y='charter_price_($/day)', data=df, showfliers=False, ax=ax)
    ax.set_title(f'Charter Price by {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Charter Price ($/day)')
    plt.xticks(rotation=270)
    plt.show()
