In [None]:
import pandas as pd
import plotly.express as px

# Read the cleaned dataset from the working directory
data = pd.read_csv(filepath_or_buffer='cleaned_v2.csv')

# Print the per-column summary (dtype and non-null count) as a quick sanity check
data.info()


In [None]:
import pandas as pd
import plotly.express as px

# Load your data
data = pd.read_csv('cleaned_v2.csv')

# Scatter plot of the lower salary bound of each row against its location.
#
# `color='Location'` makes Plotly Express emit one trace per location, so any
# per-point hover field has to be routed through px itself (`hover_name`,
# `custom_data`), which slices the column per trace. Assigning full-length
# arrays with fig.update_traces() instead would hand every trace the same
# unsliced arrays, pairing points with the wrong job title / upper bound.
fig = px.scatter(
    data,
    x='Location',
    y='Salary From',             # the value the hover box labels as 'Salary From'
    color='Location',            # one colour (and one trace) per location
    hover_name='Job Title',      # available to the template as %{hovertext}
    custom_data=['Salary To'],   # available to the template as %{customdata[0]}
    title='Salary vs Location',
    height=600,
)

# Only the template is overridden here; the data it refers to was attached
# per trace by px.scatter above. Salaries are shown with a dollar sign and
# thousands separators; <extra></extra> hides the secondary trace-name box.
fig.update_traces(
    hovertemplate=(
        'Job Title: %{hovertext}<br>'
        'Salary From: $%{y:,.0f}<br>'
        'Salary To: $%{customdata[0]:,.0f}<br>'
        '<extra></extra>'
    )
)

# Show the plot
fig.show()
 

In [None]:
# Display the upper salary bound column as the cell's output
data.loc[:, 'Salary To']


In [None]:
pip install --upgrade plotly

In [None]:
# visualizing missing data - Needed

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load your data
data = pd.read_csv('cleaned_v2.csv')

# 0/1 mask of missing cells (1 = missing), one row per record and one column
# per field. Cast to int so the image and its colorbar get a plain numeric range.
missing_mask = data.isnull().to_numpy(dtype=int)
n_rows, n_cols = missing_mask.shape

# Label only a bounded number of rows. One tick per row is unreadable and slow
# to render on a large table; for tables of up to `max_row_ticks` rows the step
# is 1 and every row is still labelled.
max_row_ticks = 20
row_step = max(1, -(-n_rows // max_row_ticks))  # ceiling division
row_ticks = np.arange(0, n_rows, row_step)

# Create a heatmap to visualize missing data
plt.figure(figsize=(10, 6))
plt.imshow(missing_mask, cmap='viridis', aspect='auto')
plt.colorbar(label='Missing Value Indicator')
plt.title('Missing Data Heatmap')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.xticks(ticks=np.arange(n_cols), labels=data.columns, rotation=45)
# Row labels are 1-based while tick positions are 0-based image rows
plt.yticks(ticks=row_ticks, labels=row_ticks + 1)
plt.show()


In [None]:
# Imputation comparisons
#
# Histograms of both salary bounds before imputation and after mean / KNN
# imputation: one grid row per dataset, one grid column per salary bound.
# TODO: add MICE?

original = pd.read_csv('cleaned_v2.csv')
mean_imputed = pd.read_csv('mean_imputed_v2.csv')
knn_imputed = pd.read_csv('knn_imputed_v2.csv')

# (dataset, title prefix, legend label, bar colour) for each grid row
hist_rows = [
    (original, 'Original', 'Before Imputation', 'blue'),
    (mean_imputed, 'Mean Imputed', 'After Imputation', 'orange'),
    (knn_imputed, 'KNN Imputed', 'After Imputation', 'green'),
]
# (column to plot, title suffix) for each grid column
hist_cols = [
    ('Salary From', 'Low Range'),
    ('Salary To', 'High Range'),
]

hist_fig, hist_axes = plt.subplots(
    len(hist_rows), len(hist_cols), figsize=(12, 6), squeeze=False
)
for axes_row, (frame, prefix, legend_label, colour) in zip(hist_axes, hist_rows):
    for ax, (column, suffix) in zip(axes_row, hist_cols):
        ax.hist(
            frame[column].dropna(),  # missing values carry no information for a histogram
            bins=50, alpha=0.5, label=legend_label, color=colour, edgecolor='black',
        )
        ax.set_title(f'{prefix} Salary - {suffix}')
        ax.set_xlabel('Salary')
        ax.set_ylabel('Frequency')
        # Legend per panel: a single trailing plt.legend() would only reach
        # the last subplot and leave the other labels unused.
        ax.legend()

plt.tight_layout()
plt.show()




In [None]:
# Side-by-side box plots of both salary bounds for the original, mean-imputed
# and KNN-imputed datasets.
box_labels = ['Original - Low', 'Original - High',
              'Mean - Low', 'Mean - High',
              'KNN - Low', 'KNN - High']

# Missing values are dropped from every series (not only the original ones) so
# that any NaN left in an imputed file cannot distort or blank out its box.
box_series = [
    frame[column].dropna()
    for frame in (original, mean_imputed, knn_imputed)
    for column in ('Salary From', 'Salary To')
]

plt.figure(figsize=(12, 6))
plt.boxplot(box_series)
# Tick labels are applied with xticks instead of boxplot's `labels=` keyword:
# that keyword is deprecated in Matplotlib 3.9 (renamed `tick_labels=`), while
# xticks behaves the same on both sides of the rename.
plt.xticks(range(1, len(box_labels) + 1), box_labels)
plt.ylabel('$ Amount')
plt.title('Salary Box Plots: Original vs Imputed')
# Render the figure instead of echoing the dict of artists boxplot returns
plt.show()


In [None]:



# Violin plots (distribution shape) with a slim box plot (quartiles) overlaid
# at the same x position, for each dataset / salary-bound combination.
violin_labels = ['Original - Low', 'Original - High',
                 'Mean - Low', 'Mean - High',
                 'KNN - Low', 'KNN - High']

# Built once, with missing values removed from every series, so the violins
# and the overlaid boxes are computed from exactly the same observations.
violin_series = [
    frame[column].dropna()
    for frame in (original, mean_imputed, knn_imputed)
    for column in ('Salary From', 'Salary To')
]

plt.figure(figsize=(12, 6))

# Create the violin plot (drawn at x = 1..6 by default)
parts = plt.violinplot(violin_series, showmeans=True, showmedians=True)

# Overlay one narrow box per violin at the matching x position
for position, series in enumerate(violin_series, start=1):
    plt.boxplot(
        series,
        positions=[position], widths=0.1, patch_artist=True,
        boxprops=dict(facecolor='cyan', color='black'),
        whiskerprops=dict(color='black'),
        capprops=dict(color='black'),
        medianprops=dict(color='red'),
        showfliers=False  # Hide outliers for clarity
    )

# Set x-tick labels (each boxplot call above rewrites the tick labels, so this
# has to come after the loop)
plt.xticks(range(1, len(violin_labels) + 1), violin_labels)

# Add labels and title
plt.xlabel('Salary')
plt.ylabel('$ Amount')
plt.title('Salary Quartiles and Distribution')

# Show the plot
plt.show()

In [None]:
import pandas as pd
import plotly.figure_factory as ff

# Load your data
data = pd.read_csv('cleaned_v2.csv')

# Drop every column whose name starts with 'Unnamed', wherever it sits in the
# frame (presumably an index column written out by an earlier to_csv - verify).
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

# Keep all numeric columns for the correlation, whatever their bit width
numeric_data = data.select_dtypes(include='number')

# Compute the pairwise correlation matrix (pandas default method)
correlation_matrix = numeric_data.corr()

# Reverse the ROW order only; columns keep their original order.
# NOTE(review): presumably done because the heatmap draws the first row at the
# bottom, so reversing puts the first variable at the top - confirm visually.
correlation_matrix = correlation_matrix.iloc[::-1]

# Create an interactive heatmap with each cell annotated by its rounded value
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=correlation_matrix.round(2).values,
    colorscale='Viridis'
)

# Update layout for better visualization
fig.update_layout(
    title='Interactive Correlation Matrix (Flipped)',
    xaxis_title='Variables',
    yaxis_title='Variables',
    height=600,
    width=800
)

# Show the plot
fig.show()
