In [62]:
import pandas as pd
import numpy as np
import seaborn as sns

In [17]:
# Load the KCSE results from CSV (path is relative to the notebook's working directory).
DATA_PATH = "../input/kcse-dataset/KCSE_data.csv"
dataset = pd.read_csv(DATA_PATH)

# Taking a quick look at the data structure.
# Only the last expression of a cell is rendered automatically, so the
# intermediate results are shown explicitly with the notebook's display().
display(dataset.head())
display(dataset.keys())

# Getting a quick description of the data (info() prints its own report).
dataset.info()
# Author's observations from the info() report:
# - There are 36 instances in the dataset (a small dataset).
# - All attributes have 36 non-null values, meaning there are no missing values.
# - All attributes except Gender are numerical (integers). Gender is stored as
#   text in the CSV, which makes it a categorical attribute.

# How many rows carry each distinct Gender label.
dataset["Gender"].value_counts()

In [15]:
# Supervised learning using labeled data.

# Frequency of each distinct value in the "A" column. display() is required here
# because only the last expression of a cell is rendered automatically.
display(dataset["A"].value_counts())

# describe() shows a summary of the numerical attributes.
dataset.describe()

In [4]:
#To picture the data graphically, we use histograms.
%matplotlib inline
import matplotlib.pyplot as plt
dataset.hist(bins=50, figsize=(20,15))
plt.show()

In [33]:
# NOTE(review): disabled experiment. When uncommented, these two lines bin the "A"
# column into five labelled categories (stored in a new "high" column) using
# pd.cut, then draw a histogram of that column. Re-enable or remove as appropriate.
#dataset["high"] = pd.cut(dataset["A"], bins=[0, 1.5, 3.0, 4.5, 6., np.inf], labels=[1,2,3,4,5])
#dataset['high'].hist()

In [5]:
# To get various correlations between the Gender labels and the best (A) and worst (E) grades.
# The former `cmap` argument was removed: no `c=` values were supplied, so there
# was nothing to colour-map and the argument had no effect on the plot.
for grade in ("A", "E"):
    dataset.plot(
        kind="scatter",
        x="Gender",
        y=grade,
        figsize=(40, 3),  # very wide so the many Gender labels on the x axis stay readable
        title=f"Grade {grade} count per Gender label",
    )
# The images show how the best and worst grades evolved over the years, for each gender and overall.

In [6]:
# Pairwise correlation between the numeric columns only. Text columns (such as
# Gender) are excluded explicitly: DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it, and select_dtypes works on every version.
corr_matrix = dataset.select_dtypes(include="number").corr()
corr_matrix["A"].sort_values(ascending=False)

# Since the dataset is small, the correlation can easily be computed between every pair of attributes.
# In this case we see the correlation between grade A and all other grades.
# When the coefficient is close to 1, there is a strong positive correlation; for example, grade A- tends to go up when A goes up.
# When the coefficient is close to -1, there is a strong negative correlation (you can see a small negative
# correlation between grade E and A: A has a tendency to go down when E increases).


In [7]:
# Pandas' scatter_matrix function is another way to check for correlation between attributes.

from pandas.plotting import scatter_matrix

# Columns handed to scatter_matrix.
attributes = ["Gender", "A", "A-", "B", "C", "D", "E"]
selected_columns = dataset[attributes]
scatter_matrix(selected_columns, figsize=(12, 8))

In [18]:
# Splitting the Gender column into Gender and Year to gain better insight.
# The text before the first "(" stays in Gender; the remainder becomes Year.
# Guarded so the cell can be re-run: once Year exists, Gender no longer contains
# "(" and a second split would yield a single column and fail on assignment.
if "Year" not in dataset.columns:
    # n=1 caps the result at two columns even if a label contains several "(".
    dataset[["Gender", "Year"]] = dataset["Gender"].str.split("(", n=1, expand=True)
dataset

In [21]:
# Strip the closing bracket from the values in the Year column.
year_without_bracket = dataset["Year"].str.strip(")")
dataset["Year"] = year_without_bracket
dataset

In [39]:
# Getting the total number of students per row (one row per Gender/Year pair).
# Only the grade columns are summed. This keeps the text Gender/Year columns out
# of the sum (a row-wise sum over text raises a TypeError in pandas >= 2.0) and
# makes the cell safe to re-run, because "Total" is never added into itself.
grade_columns = ["A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "E"]
dataset["Total"] = dataset[grade_columns].sum(axis=1)
dataset.head()

In [26]:
# Arranging columns in order: Year and Gender first, then the grades from A to E.
# Any other column already on the frame (for example a previously computed
# "Total") is kept at the end instead of being dropped, because the plotting
# cells further down read "Total" from this same frame.
leading_columns = ["Year", "Gender", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "E"]
trailing_columns = [name for name in dataset.columns if name not in leading_columns]
dataset = dataset[leading_columns + trailing_columns]
dataset.head()

In [35]:
# Getting students who qualified to join campus (C+ and above).

# Grades counted as a pass, named explicitly rather than taken by column position
# so the result does not depend on the current column order.
pass_grades = ["A", "A-", "B+", "B", "B-", "C+"]

# Year, Gender and the pass grades, plus a "Pass" column holding their row-wise sum.
# .assign() builds a new frame, which avoids writing into a slice of `dataset`
# (SettingWithCopyWarning), and summing only the grade columns keeps the text
# Year/Gender columns out of the sum (TypeError in pandas >= 2.0).
dataset_pass = dataset[["Year", "Gender"] + pass_grades].assign(
    Pass=lambda frame: frame[pass_grades].sum(axis=1)
)

# Attach the Pass column to the full table, aligned on the row index.
dataset1 = dataset.join(dataset_pass[["Pass"]])
dataset1

In [54]:
# Showing the number of candidates who took the exams over the years.

# The original call passed hue_order=["ALL"] without a hue variable, so it was
# ignored and each bar showed the mean of Total over every Gender label of that
# year. Filtering the rows makes the intended selection explicit.
# NOTE(review): assumes rows labelled "ALL" hold the combined totals — confirm
# against dataset["Gender"].value_counts().
all_candidates = dataset[dataset["Gender"] == "ALL"]
sns.catplot(x="Year", y="Total", kind="bar", data=all_candidates, height=5, aspect=1.5)

# From the graph, we can see an increase in students taking the exam over the years.

# Labeling the graph (applies to the current axes, i.e. the plot drawn above).
plt.xlabel("Year of Exam", fontsize=15)
plt.ylabel("Total number of candidates", fontsize=15)
plt.title("Total Number Candidates Per Year", fontsize=20)

In [55]:
# Candidates per year, broken down by gender (FEMALE and MALE bars side by side).

gender_bar_options = {
    "kind": "bar",
    "hue": "Gender",
    "hue_order": ["FEMALE", "MALE"],
    "height": 7,
    "aspect": 2.5,
}
sns.catplot(data=dataset, x="Year", y="Total", **gender_bar_options)


In [61]:
# Showing grades per gender: one line chart per grade from A down to C+,
# with a MALE and a FEMALE line in each chart.
# A loop replaces six copy-pasted relplot calls that differed only in the y column.
for grade in ["A", "A-", "B+", "B", "B-", "C+"]:
    grid = sns.relplot(
        x="Year",
        y=grade,
        hue="Gender",
        kind="line",
        data=dataset,
        hue_order=["MALE", "FEMALE"],
        height=6,
        aspect=2,
    )
    # Title each figure so it can be read on its own when skimming the notebook.
    grid.set(title=f"Grade {grade} candidates per year by gender")

In [59]:
# Stacked bar chart: one bar per row, labelled by Gender, with a segment for every grade column.
stacked_columns = ["Gender", "A", "A-", "B+", "B", "B-", "C+", "C", "C-", "D+", "D", "D-", "E"]
grades_by_gender = dataset[stacked_columns]
grades_by_gender.plot(kind="bar", x="Gender", stacked=True, figsize=(14, 14))
# Anchor the legend's upper-left corner just past the right edge of the axes.
plt.legend(loc="upper left", bbox_to_anchor=(1.02, 1), borderaxespad=0)