https://towardsdatascience.com/how-to-analyze-survey-data-with-python-84eff9cc9568

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
pip install pyreadstat

Collecting pyreadstatNote: you may need to restart the kernel to use updated packages.
  Downloading pyreadstat-1.1.9-cp39-cp39-win_amd64.whl (2.4 MB)
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.1.9



In [3]:
import pyreadstat

In [7]:
# Read the SPSS file: `df` receives the answers (stored as numeric codes) and
# `meta` the metadata (column labels, value labels) used in the cells below.
# NOTE(review): the relative path is resolved against the kernel's working
# directory — confirm the file sits next to the notebook.
df,meta = pyreadstat.read_sav("Surveydata.sav")

In [8]:
# Preview the first rows of the survey answers.
df.head()

Unnamed: 0,Sat_overall,Sat_service,Sat_product,NPS,Age,Region
0,1.0,5.0,5.0,2.0,3.0,1.0
1,2.0,1.0,4.0,3.0,1.0,1.0
2,2.0,5.0,3.0,1.0,2.0,4.0
3,4.0,1.0,3.0,1.0,2.0,2.0
4,4.0,1.0,4.0,2.0,3.0,3.0


In [9]:
# Question text attached to each column of the survey file.
meta.column_labels

['How satisfied are you overall?',
 'How satisfied are you with our service?',
 'How satisfied are you with our products?',
 'Would you recommend us?',
 'How old are you?',
 'Which region do you come from?']

In [11]:
# Lookup from column name to the question text of that column.
meta_dict = {
    name: label
    for name, label in zip(meta.column_names, meta.column_labels)
}

In [12]:
# Display the column-name -> question-text lookup built above.
meta_dict

{'Sat_overall': 'How satisfied are you overall?',
 'Sat_service': 'How satisfied are you with our service?',
 'Sat_product': 'How satisfied are you with our products?',
 'NPS': 'Would you recommend us?',
 'Age': 'How old are you?',
 'Region': 'Which region do you come from?'}

In [13]:
# Relative frequency of each Age code, ordered by code.
(
    df["Age"]
    .value_counts(normalize=True)
    .sort_index()
)

1.0    0.291304
2.0    0.330435
3.0    0.378261
Name: Age, dtype: float64

In [14]:
# Same relative frequencies, with the Age codes replaced by their value labels.
# value_counts orders the result by frequency, not by the original code order.
(
    df["Age"]
    .map(meta.variable_value_labels["Age"])
    .value_counts(normalize=True)
)

>50      0.378261
30-50    0.330435
<30      0.291304
Name: Age, dtype: float64

In [16]:
# Labelled relative frequencies, re-ordered to follow the order of the labels
# in the file's value-label dictionary instead of the frequency order.
(
    df["Age"]
    .map(meta.variable_value_labels["Age"])
    .value_counts(normalize=True)
    .loc[meta.variable_value_labels["Age"].values()]
)

<30      0.291304
30-50    0.330435
>50      0.378261
Name: Age, dtype: float64

In [17]:
# Overall satisfaction by age group on the raw codes; every Age column sums to 1.
pd.crosstab(
    index=df["Sat_overall"],
    columns=df["Age"],
    normalize="columns",
    dropna=True,
)

Age,1.0,2.0,3.0
Sat_overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.208955,0.223684,0.252874
2.0,0.238806,0.197368,0.229885
3.0,0.164179,0.236842,0.16092
4.0,0.104478,0.184211,0.195402
5.0,0.283582,0.157895,0.16092


In [19]:
# Overall satisfaction by age group with value labels, as column percentages.
# Rows and columns are re-ordered to follow the order of the value-label
# dictionaries rather than the alphabetical order crosstab produces.
(
    pd.crosstab(
        df["Sat_overall"].map(meta.variable_value_labels["Sat_overall"]),
        df["Age"].map(meta.variable_value_labels["Age"]),
        dropna=True,
        normalize="columns",
    )
    .loc[meta.variable_value_labels["Sat_overall"].values()]
    .loc[:, meta.variable_value_labels["Age"].values()]
    * 100
)

Age,<30,30-50,>50
Sat_overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Completly satisfied,20.895522,22.368421,25.287356
Very satisfied,23.880597,19.736842,22.988506
Satisfied,16.41791,23.684211,16.091954
Less satisfied,10.447761,18.421053,19.54023
Dissatisfied,28.358209,15.789474,16.091954


In [22]:
# Post-stratification weight per respondent: target share of the respondent's
# age group divided by that group's share in the sample.
# Target shares are keyed by Age code (1: <30, 2: 30-50, 3: >50).
AGE_TARGET_SHARE = {1.0: 0.5, 2.0: 0.25, 3.0: 0.25}

# Sample shares are derived from the data instead of being hard-coded
# (67/230, 76/230 and 87/230 for this file).
age_sample_share = df["Age"].value_counts(normalize=True)

# Rows whose Age is missing or has no target share get a NaN weight.
df["weight"] = df["Age"].map(AGE_TARGET_SHARE) / df["Age"].map(age_sample_share)

In [25]:
def weighted_frequency(x, y, data=None, value_labels=None):
    """Return the weighted relative frequency of each category of ``x``.

    For every distinct value of column ``x`` the weights in column ``y`` are
    summed and divided by the total of ``y`` over the whole frame.

    Parameters
    ----------
    x : str
        Name of the categorical column to tabulate.
    y : str
        Name of the column holding the weights.
    data : pandas.DataFrame, optional
        Frame containing both columns. Defaults to the notebook-level ``df``.
    value_labels : dict, optional
        Mapping from category code to display label. Defaults to
        ``meta.variable_value_labels[x]`` from the notebook-level ``meta``.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns ``"Labels"`` and ``"Fruquency"``.
        The column name keeps its original spelling so that existing
        callers and saved outputs stay valid.
    """
    # Fall back to the notebook globals only when the caller passes nothing,
    # so existing two-argument calls keep working.
    if data is None:
        data = df
    if value_labels is None:
        value_labels = meta.variable_value_labels[x]

    # Rows whose `x` is missing are dropped by groupby but still count in the
    # denominator, exactly as before.
    shares = data.groupby(x)[y].sum() / data[y].sum()

    return pd.DataFrame(
        {
            "Labels": shares.index.map(value_labels),
            "Fruquency": shares.to_numpy(),
        }
    )

In [26]:
# Sanity check: the weighted age shares should reproduce the target shares
# used to build the weights (0.5 / 0.25 / 0.25).
weighted_frequency("Age","weight")

Unnamed: 0,Labels,Fruquency
0,<30,0.5
1,30-50,0.25
2,>50,0.25


In [27]:
# Weighted distribution of overall satisfaction across all respondents.
weighted_frequency("Sat_overall","weight")

Unnamed: 0,Labels,Fruquency
0,Completly satisfied,0.223617
1,Very satisfied,0.226216
2,Satisfied,0.18153
3,Less satisfied,0.147142
4,Dissatisfied,0.221495


In [28]:
# Weighted crosstab of overall satisfaction by age group, as column percentages.
# The weights are constant within each age group, so normalising by column
# yields the same percentages as the unweighted table; they would only differ
# for row or overall normalisation.
(
    pd.crosstab(
        index=df["Sat_overall"].map(meta.variable_value_labels["Sat_overall"]),
        columns=df["Age"].map(meta.variable_value_labels["Age"]),
        values=df["weight"],
        # The string "sum" replaces the Python builtin, which newer pandas
        # versions deprecate as an aggfunc.
        aggfunc="sum",
        dropna=True,
        normalize="columns",
    )
    # Re-order rows and columns to follow the value-label dictionaries.
    .loc[meta.variable_value_labels["Sat_overall"].values()]
    .loc[:, meta.variable_value_labels["Age"].values()]
    * 100
)

Age,<30,30-50,>50
Sat_overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Completly satisfied,20.895522,22.368421,25.287356
Very satisfied,23.880597,19.736842,22.988506
Satisfied,16.41791,23.684211,16.091954
Less satisfied,10.447761,18.421053,19.54023
Dissatisfied,28.358209,15.789474,16.091954
