In [5]:
# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Step 2: Load the dataset
# Assuming you've downloaded the 'airline_passenger_satisfaction.csv' file
data=pd.read_csv('https://raw.githubusercontent.com/HWhr3000/F21DL_Coursework_grp2/main/data/airline_passenger_satisfaction.csv')

# Step 3: Explore the data
print(data.head())
print(data.info())
print(data.describe())



   ID  Gender  Age Customer Type Type of Travel     Class  Flight Distance  \
0   1    Male   48    First-time       Business  Business              821   
1   2  Female   35     Returning       Business  Business              821   
2   3    Male   41     Returning       Business  Business              853   
3   4    Male   50     Returning       Business  Business             1905   
4   5  Female   49     Returning       Business  Business             3470   

   Departure Delay  Arrival Delay  Departure and Arrival Time Convenience  \
0                2            5.0                                       3   
1               26           39.0                                       2   
2                0            0.0                                       4   
3                0            0.0                                       2   
4                0            1.0                                       3   

   ...  Seat Comfort  Leg Room Service  Cleanliness  Food and Drink 

In [6]:
# Check for missing values
print(data.isnull().sum())

data['Arrival Delay'].fillna(data['Arrival Delay'].mean(), inplace=True)

# Check for missing values
print(data.isnull().sum())

ID                                          0
Gender                                      0
Age                                         0
Customer Type                               0
Type of Travel                              0
Class                                       0
Flight Distance                             0
Departure Delay                             0
Arrival Delay                             393
Departure and Arrival Time Convenience      0
Ease of Online Booking                      0
Check-in Service                            0
Online Boarding                             0
Gate Location                               0
On-board Service                            0
Seat Comfort                                0
Leg Room Service                            0
Cleanliness                                 0
Food and Drink                              0
In-flight Service                           0
In-flight Wifi Service                      0
In-flight Entertainment           

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Arrival Delay'].fillna(data['Arrival Delay'].mean(), inplace=True)


In [7]:
# Assuming `data` is your DataFrame with the relevant features
# Drop any columns that are not needed, like unique identifiers or target labels
# data = data.drop(['id', 'Satisfaction'], axis=1)

# Assuming 'data' is your DataFrame
data_encoded = pd.get_dummies(data, drop_first=True)  # `drop_first=True` to avoid multicollinearity

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

ValueError: could not convert string to float: 'Male'

In [None]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Test different numbers of components
bic_scores = []
silhouette_scores = []
n_clusters_range = range(2, 10)  # Change range as needed

for n_clusters in n_clusters_range:
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(scaled_data)
    labels = gmm.predict(scaled_data)
    
    # BIC score
    bic = gmm.bic(scaled_data)
    bic_scores.append(bic)
    
    # Silhouette score
    silhouette = silhouette_score(scaled_data, labels)
    silhouette_scores.append(silhouette)

# Plot BIC and silhouette scores to help determine the best number of clusters
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(n_clusters_range, bic_scores, marker='o')
plt.title('BIC Score')
plt.xlabel('Number of Clusters')
plt.ylabel('BIC')

plt.subplot(1, 2, 2)
plt.plot(n_clusters_range, silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Choose the optimal number of clusters, for example, 3
optimal_clusters = 3  # Change this based on your results

gmm = GaussianMixture(n_components=optimal_clusters, random_state=42)
gmm.fit(scaled_data)
labels = gmm.predict(scaled_data)
data['Cluster'] = labels  # Add cluster labels to your original DataFrame

In [None]:
# Calculate cluster means
cluster_means = data.groupby('Cluster').mean()
print(cluster_means)

# Visualize the clusters if you have only a few dimensions (e.g., 2 or 3)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=scaled_data[:, 0], y=scaled_data[:, 1], hue=labels, palette='viridis', s=60)
plt.title("Clusters from GMM")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.legend(title="Cluster")
plt.show()