## Questions to answer !
Which factor influenced a candidate in getting placed?<br>
Does percentage matters for one to get placed?<br>
Which degree specialization is much demanded by corporate?<br>
Play with the data conducting all statistical tests.<br>

In [45]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

from scipy import stats
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from scipy.stats import chi2
from statsmodels.formula.api import ols
from plotly.subplots import make_subplots

In [62]:
"""
Continuous variable
"""
def get_central_tendency(df, col_name):
    """
    df: Input data frame
    col_name: Column name in the input dataframe
    """
    if col_name in df.columns:      
        arguments = {'mean': df[col_name].mean(),
                     'median': df[col_name].median(),
                     'mode': stats.mode(df[col_name]),
                     'min': df[col_name].min(),
                     'max': df[col_name].max(),
                     'skew': df[col_name].skew(),
                     'kurtosis': df[col_name].kurtosis(),
                     'std': data['salary'].std(),
                     'var': data['salary'].var(),                     
                     '25th_perc': data['salary'].quantile(0.25),
                     '75th_perc': data['salary'].quantile(0.75),
                     'IQR': stats.iqr(data.loc[~data[col_name].isnull(), col_name])
                    }
        
        print("\n -----------CENTRAL TENDENCY OF {}-----------".format(str(col_name).upper()))
        print("""\n Mean: {mean} 
                 \n Median: {median} 
                 \n Mode: {mode}
                 \n Skewness: {skew}
                 \n Kurtosis: {kurtosis}
              """.format(**arguments))
        
        print("\n -----------MEASURE OF DISPERSION OF {}-----------".format(str(col_name).upper()))
        print("""\n Standard deviation: {std} 
                 \n Variance: {var}
                 \n Range of values: {min} - {max}
                 \n Inter quartile range: {IQR}
                 \n 25th Percentile: {25th_perc}
                 \n 75th Percentile: {75th_perc}
              """.format(**arguments))
        
    else:
        print("Requested column not in the given dataframe")
        
def get_ecdf(df, col_name):
    """
    Plot the emprical distributed value of a continuos variable
    df: Input data frame
    col_name: Column name in the input dataframe
    """
    x = np.empty([0])
    y = np.empty([0])
    if col_name in df.columns:      
        x = np.sort(np.sort(df[col_name]))
        n = x.size
        y = np.arange(1, n+1) / n
    else:
        print("Requested column not in the given dataframe")
    return x, y

def plot_all_distribution(df, col_name):
    """
    Plot the histogram, boxplot and ECDF
    """
    if col_name in df.columns:
        # Subplots
        fig = make_subplots(rows=1, 
                            cols=3)

        # Plot histogram
        fig.add_trace(go.Histogram(x=df[col_name], 
                                   name='Histogram'),
                      row=1,
                      col=1)

        # Plot box plot
        fig.add_trace(go.Box(y=df[col_name],
                             boxpoints='all',
                             name='Boxplot'),
                      row=1,
                      col=2)

        # Plot ECDF function
        x, y = get_ecdf(df, col_name)
        fig.add_trace(go.Scatter(x=x, 
                                 y=y, 
                                 mode='markers',
                                 name='ECDF'),
                      row=1,
                      col=3)

        fig.update_layout(title = {'text': "{} VARIABLE DISTRIBUTION".format(str(col_name).upper()),
                                   'x': 0.5})

        fig.show()

"""
Categorical variable
"""
def get_frequency_table(df, col_name):
    """
    Returns frequency table
    """
    if col_name in df.columns:
        # Absolute frequency
        cat_frequency = data['specialisation'].value_counts().reset_index()
        
        # Column rename
        cat_frequency.rename(columns={'index': 'specialisation',
                                     'specialisation': 'Absolute frequency'}, 
                            inplace=True)
        
        # Absolute frequency (in percentage)
        cat_frequency['Relative frequency (%)'] = (cat_frequency['Absolute frequency'] / 
                                               cat_frequency['Absolute frequency'].sum()) * 100
        
        return cat_frequency.sort_values(['Absolute frequency'], ascending=False)
        
    else:
        print("Requested column not in the given dataframe")
        
def plot_cat_data(df_frequency, col_name):
    """
    Plot frequency distribution as bar chart
    df_frequency: Input data frame with value counts and %
    col_name: Column name in the input dataframe
    """
    # Create 2 columns plots
    fig = make_subplots(rows=1,
                   cols=2)
    
    # Plot value count in bar chart in first plot
    fig.add_trace(go.Bar(x=df_frequency[col_name],
                         y=df_frequency['Absolute frequency'],
                         text=df_frequency['Absolute frequency'],
                         name='Absolute frequency'),
                 row=1,
                 col=1)
    
    # Plot % of value count in bar chart in second plot
    fig.add_trace(go.Bar(x=df_frequency[col_name],
                         y=df_frequency['Relative frequency (%)'],
                         text=df_frequency['Relative frequency (%)'],
                         texttemplate='%{text:.2f}' + "%",
                         name='Relative frequency %'),
                 row=1,
                 col=2)
    
    fig.update_traces(textposition='outside')
    fig.update_layout(title = {'text': "{} FREQUENCY DISTRIBUTION".format(str(col_name).upper()),
                               'x': 0.5})
    fig.show()

    
def continuous_bivariate(df, col_list):
    """
    Find the relation between 2 numerical cols by ploting scatter plot and pearson co-efficient matrix
    Data: Input data frame
    col_list: list of numerical cols
    """
    # Get Correlation matrix
    corr_matrix = df[numerical_cols].corr()
    corr_matrix = corr_matrix.round(3)
    
    fig = ff.create_annotated_heatmap(z=corr_matrix.values,
                                  x=list(corr_matrix.columns),
                                  y=list(corr_matrix.index),
                                  colorscale='Reds',
                                  annotation_text=corr_matrix.values,
                                  showscale=True
                                 )
    fig.update_layout(title = {'text': 'Pearson correlation Matrix', 
                               'x': 0.5},
                     height=500,
                     width=800)

    fig.show()
    
    # Get scatterplot corelation
    fig = px.scatter_matrix(data, dimensions=numerical_cols, color='status')
    
    fig.update_layout(title = {'text': 'Scatter plot correlation Matrix', 
                               'x': 0.5})
    fig.show()
    
    
def get_F_Stats(df, numerical_col, nominal_col, annova_type=1):
    """
    Do ANNOVA test and get F statistics
    df: input data
    numerical_col: Name of the numerical col in df
    nominal_col: Name of the nominal/categorical col in df
    """
    if not df.empty:
        # Method 1
        query_args = {'numerical_col': numerical_col,
                     'nominal_col': nominal_col}
        mod = ols('{numerical_col} ~ {nominal_col}'.format(**query_args), data=df).fit()
        print(sm.stats.anova_lm(mod, typ=annova_type))

        # Method 2
        # groups = {each_status: df.loc[df.status == each_status, 'degree_p'].values for each_status in data.status.unique()}
        # f_oneway(groups['Placed'], groups['Not Placed'])
        
    
def get_chi_stats(df, nominal_col1, nominal_col2):
    """
    Do Chi sqaured test of independence
    df: input data
    nominal_col1: Name of the nominal/categorical col in df
    nominal_col2: Name of the nominal/categorical col in df
    """
    if not df.empty:
        observed = pd.crosstab(index=df[nominal_col1],
                       columns=df[nominal_col2],
                       margins=True,
                       margins_name='Total')
        chi2_value, p_value, degree_of_freedom, expected = chi2_contingency(observed, correction=False)
        print("Chi Statistics",chi2_value, 
              "\nProbability value",p_value,
              "\nDegree of freedom", degree_of_freedom)

#Ref
# # Add shapes
# fig.add_shape(
#         # Line Vertical
#         dict(
#             type="line",
#             x0=data.salary.mean(),
#             x1=data.salary.mean(),
#             y0=0,
#             y1=100,
#             line=dict(
#                 color="RoyalBlue",
#                 width=3
#             )
# ))

## Read data

In [4]:
data = pd.read_csv('Placement_Data_Full_Class.csv')

target_cols = ['status', 'salary']
predictor_cols = list(set(data.columns).difference(set(target_cols)))
character_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']
numerical_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']
continuous_cols = ['ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p', 'salary']
nominal_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']
ordinal_cols = []

data.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


### UNIVARAIATE ANALYSIS

In [5]:
# Continuos variable analysis
# get_central_tendency(data, 'salary')
# plot_all_distribution(data, 'salary')

# # Categorical Analysis
# freq_table = get_frequency_table(data, 'specialisation')
# plot_cat_data(freq_table, 'specialisation')

### BIVARAIATE ANALYSIS

In [6]:
# Continous - continous
# continuous_bivariate(data, numerical_cols)

In [7]:
pd.crosstab(data.specialisation, 
            data.status, 
            margins=True, 
            margins_name='Total')

status,Not Placed,Placed,Total
specialisation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mkt&Fin,25,95,120
Mkt&HR,42,53,95
Total,67,148,215


### HYPOTHESIS TESTING

In [41]:
get_F_Stats(data, 'degree_p', 'status')

             df       sum_sq      mean_sq          F        PR(>F)
status      1.0  2668.406406  2668.406406  63.719176  8.807682e-14
Residual  213.0  8919.929587    41.877604        NaN           NaN


In [61]:
get_chi_stats(data, 'status', 'specialisation')

Chi Statistics 13.508014470676486 
Probability value 0.009042700123215265 
Degree of freedom 4
