In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
from scipy.stats import shapiro
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.decomposition import PCA
from scipy.stats import binom
import re
from collections import Counter

# Q1 

In [2]:
# Import data

q1 = pd.read_csv("ingredient.csv")

#### a. A descriptive analysis of the additives (columns named as “a” to “i”), which must include summaries of findings (parametric/non-parametric). Correlation and ANOVA, if applicable, is a must.

In [3]:
#descriptive analysis

q1.describe(include='all')

Unnamed: 0,a,b,c,d,e,f,g,h,i
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0
25%,1.516522,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51


In [4]:
q1.nunique()

a    178
b    142
c     94
d    118
e    133
f     65
g    143
h     34
i     32
dtype: int64

In [5]:
for column in q1.columns:
	if shapiro(q1[column]).pvalue > 0.05:
		print('Column ', column, ' is normally distributed')
	else:
		print('Column ', column, ' is not normally distributed')

Column  a  is not normally distributed
Column  b  is not normally distributed
Column  c  is not normally distributed
Column  d  is not normally distributed
Column  e  is not normally distributed
Column  f  is not normally distributed
Column  g  is not normally distributed
Column  h  is not normally distributed
Column  i  is not normally distributed


A Non parametric test is chosen (Kendall Correlation) as data is not normally distributed

In [6]:
corr_score = q1.corr(method = 'kendall')

strong_pos, strong_neg, mod_pos, mod_neg, weak_pos, weak_neg = ([] for i in range(6))

for row in corr_score:
	for column in corr_score:
		if corr_score.at[row,column] > 0.5 and corr_score.at[row,column] < 1 :
			if [column,row] not in strong_pos:
				strong_pos.append([row,column])
		if corr_score.at[row,column] > 0.3 and corr_score.at[row,column] < 0.5 :
			if [column,row] not in mod_pos:
				mod_pos.append([row,column])
		if corr_score.at[row,column] > 0 and corr_score.at[row,column] < 0.3 :
			if [column,row] not in weak_pos:
				weak_pos.append([row,column])
		if corr_score.at[row,column] < -0.5 and corr_score.at[row,column] > -1 :
			if [column,row] not in strong_neg:
				strong_neg.append([row,column])
		if corr_score.at[row,column] < -0.3 and corr_score.at[row,column] > -0.5 :
			if [column,row] not in mod_neg:
				mod_neg.append([row,column])
		if corr_score.at[row,column] < 0 and corr_score.at[row,column] > -0.3 :
			if [column,row] not in weak_neg:
				weak_neg.append([row,column])
print('Variables with strong postivite correlation: \n', ' '.join(str(x) for x in strong_pos) )
print('Variables with moderate postivite correlation: \n', ' '.join(str(x) for x in mod_pos) )
print('Variables with weak postivite correlation: \n', ' '.join(str(x) for x in weak_pos) )
print('Variables with strong negative correlation: \n', ' '.join(str(x) for x in strong_neg) )
print('Variables with moderate negative correlation: \n', ' '.join(str(x) for x in mod_neg) )
print('Variables with weak negative correlation: \n', ' '.join(str(x) for x in weak_neg) )

fig = px.imshow(corr_score, text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.show()

Variables with strong postivite correlation: 
 ['a', 'g']
Variables with moderate postivite correlation: 
 ['b', 'h'] ['d', 'h']
Variables with weak postivite correlation: 
 ['a', 'b'] ['a', 'c'] ['a', 'i'] ['b', 'd'] ['b', 'g'] ['c', 'f'] ['c', 'i'] ['d', 'e'] ['d', 'f'] ['e', 'f'] ['e', 'h'] ['f', 'i'] ['g', 'i'] ['h', 'i']
Variables with strong negative correlation: 
 
Variables with moderate negative correlation: 
 ['a', 'd'] ['a', 'e'] ['b', 'f'] ['c', 'd'] ['c', 'h'] ['f', 'g']
Variables with weak negative correlation: 
 ['a', 'f'] ['a', 'h'] ['b', 'c'] ['b', 'e'] ['b', 'i'] ['c', 'e'] ['c', 'g'] ['d', 'g'] ['d', 'i'] ['e', 'g'] ['e', 'i'] ['f', 'h'] ['g', 'h']


#### b. A graphical analysis of the additives, including a distribution study.

Data in each column is normalized to study the distribution and further analysis

In [7]:
q1_norm = pd.DataFrame()
for column in q1.columns: 
	x = q1[column].values.reshape(-1,1) 
	min_max_scaler = preprocessing.MinMaxScaler()
	x_scaled = min_max_scaler.fit_transform(x)
	q1_norm[column] = pd.DataFrame(x_scaled)

hist_data = [q1_norm.a, q1_norm.b, q1_norm.c, q1_norm.d, q1_norm.e, q1_norm.f, q1_norm.g, q1_norm.h, q1_norm.i]
fig = ff.create_distplot(hist_data, q1.columns, show_hist=False)
fig.update_layout(title_text='Distribution of data')
fig.show()

fig2 = px.box(q1_norm)
fig2.update_traces(quartilemethod="exclusive")
fig2.show()

#### c. A clustering test of your choice (unsupervised learning), to determine the distinctive number of formulations present in the dataset.

In [8]:
#using PCA to visualize the labels from GMM model
pca = PCA()
Xt = pca.fit_transform(q1_norm)
from sklearn.cluster import OPTICS
gm = OPTICS(min_samples=20).fit(q1_norm).labels_

fig = px.scatter_3d(
    Xt, x=0, y=1, z=2,
    color=list(map(str,gm)),
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}, title='Visualization of predicted groups of formulations in data'
)
fig.update_traces(marker_size = 3)
fig.show()

2 clusters are predicted by OPTICS. The points labelled as `-1` are noise.

# Q2 

loading data and only selecting months of each row

In [9]:
q2 = pd.read_csv('palm_ffb.csv')
q2['Date'] = pd.to_datetime(q2['Date'],  format='%d.%m.%Y').dt.month

In [10]:
corr_score = q2.corr(method = 'kendall')

strong_pos, strong_neg, mod_pos, mod_neg, weak_pos, weak_neg = ([] for i in range(6))

for row in corr_score:
	for column in corr_score:
		if corr_score.at[row,column] > 0.5 and corr_score.at[row,column] < 1 :
			if [column,row] not in strong_pos:
				strong_pos.append([row,column])
		if corr_score.at[row,column] > 0.3 and corr_score.at[row,column] < 0.5 :
			if [column,row] not in mod_pos:
				mod_pos.append([row,column])
		if corr_score.at[row,column] > 0 and corr_score.at[row,column] < 0.3 :
			if [column,row] not in weak_pos:
				weak_pos.append([row,column])
		if corr_score.at[row,column] < -0.5 and corr_score.at[row,column] > -1 :
			if [column,row] not in strong_neg:
				strong_neg.append([row,column])
		if corr_score.at[row,column] < -0.3 and corr_score.at[row,column] > -0.5 :
			if [column,row] not in mod_neg:
				mod_neg.append([row,column])
		if corr_score.at[row,column] < 0 and corr_score.at[row,column] > -0.3 :
			if [column,row] not in weak_neg:
				weak_neg.append([row,column])
print('Variables with strong postivite correlation: \n', ' '.join(str(x) for x in strong_pos) )
print('Variables with moderate postivite correlation: \n', ' '.join(str(x) for x in mod_pos) )
print('Variables with weak postivite correlation: \n', ' '.join(str(x) for x in weak_pos) )
print('Variables with strong negative correlation: \n', ' '.join(str(x) for x in strong_neg) )
print('Variables with moderate negative correlation: \n', ' '.join(str(x) for x in mod_neg) )
print('Variables with weak negative correlation: \n', ' '.join(str(x) for x in weak_neg) )

fig = px.imshow(corr_score, text_auto=True, aspect="auto", color_continuous_scale='RdBu_r')
fig.show()

Variables with strong postivite correlation: 
 ['Date', 'FFB_Yield'] ['Average_Temp', 'Max_Temp']
Variables with moderate postivite correlation: 
 ['Date', 'Precipitation'] ['SoilMoisture', 'Precipitation'] ['Average_Temp', 'HA_Harvested']
Variables with weak postivite correlation: 
 ['Date', 'SoilMoisture'] ['Date', 'Min_Temp'] ['SoilMoisture', 'Min_Temp'] ['Average_Temp', 'Min_Temp'] ['Average_Temp', 'Working_days'] ['Min_Temp', 'Precipitation'] ['Min_Temp', 'FFB_Yield'] ['Max_Temp', 'HA_Harvested'] ['Precipitation', 'Working_days'] ['Precipitation', 'FFB_Yield'] ['Working_days', 'HA_Harvested'] ['Working_days', 'FFB_Yield']
Variables with strong negative correlation: 
 
Variables with moderate negative correlation: 
 ['Date', 'HA_Harvested'] ['SoilMoisture', 'Average_Temp'] ['SoilMoisture', 'Max_Temp'] ['Max_Temp', 'Precipitation']
Variables with weak negative correlation: 
 ['Date', 'Average_Temp'] ['Date', 'Max_Temp'] ['Date', 'Working_days'] ['SoilMoisture', 'Working_days'] ['Soi

In [11]:
fig = px.scatter_matrix(q2, color="FFB_Yield")
fig.update_traces(showupperhalf=False, diagonal_visible=False)
fig.update_layout(dragmode='select',height=1000)
fig.show()

Meaningful correlation observed are: 
 - strong correlation between the date(month) and the FFB yield

---

To further investigate the variables, we will use:
 - F-test to test for the linear dependency of variable against the yield
 - Mutual information for the non-linear dependncies against the yield

In [12]:
from sklearn.feature_selection import f_regression, mutual_info_regression
X= q2.drop(columns=['FFB_Yield'])
y = q2.FFB_Yield
f_test, _ = f_regression(X, y)
f_test /= np.max(f_test)

mi = mutual_info_regression(X, y)
mi /= np.max(mi)

titles_1 =  ["F-test={:.2f}".format(x) for x in f_test]
titles_2 =  [", Mi={:.2f}".format(x) for x in mi]
titles = list(map(str.__add__, titles_1, titles_2))
fig3 = make_subplots(rows = 2, cols=4, subplot_titles=titles)
i = 0
for row in range(2):
	for col in range(4):
		fig3.add_trace(go.Scatter(x=X.iloc[:, i], y=y,mode="markers"), row=row+1, col=col+1)
		fig3.update_xaxes(title_text=X.columns[i],  row=row+1, col=col+1)
		if col == 0:
			fig3.update_yaxes(title_text="y",  row=row+1, col=col+1)
		i += 1
fig3.update_annotations(font_size=10)
fig3.show()

In [13]:
test_res = pd.DataFrame()
test_res['Variable'] = X.columns
test_res['F-test'] = f_test
test_res['Mutual_info'] = mi
print (test_res.sort_values(by=['F-test','Mutual_info'], ascending=False))
print (test_res.sort_values(by=['Mutual_info','F-test'], ascending=False))

        Variable    F-test  Mutual_info
0           Date  1.000000     1.000000
7   HA_Harvested  0.170448     0.338414
5  Precipitation  0.111616     0.202970
6   Working_days  0.016735     0.000000
3       Min_Temp  0.013287     0.143185
4       Max_Temp  0.006212     0.000000
2   Average_Temp  0.000037     0.011847
1   SoilMoisture  0.000012     0.120434
        Variable    F-test  Mutual_info
0           Date  1.000000     1.000000
7   HA_Harvested  0.170448     0.338414
5  Precipitation  0.111616     0.202970
3       Min_Temp  0.013287     0.143185
1   SoilMoisture  0.000012     0.120434
2   Average_Temp  0.000037     0.011847
6   Working_days  0.016735     0.000000
4       Max_Temp  0.006212     0.000000


#### Conclusions:
From the Mutual information and F-test results, we can observe:
 - date(`month`) has the most effect on the yield. 
 - `HA_Harvested`, `Precipitation`, `SoilMoisture` and `Min_Temp` are weak of non-linear dependencies  
 -  `Average_Temp`, `Working_days` and  `Max_Temp` are negligible

# Q3

In [14]:
para = """As a term, data analytics predominantly refers to an assortment of applications, from basic business 
intelligence (BI), reporting and online analytical processing (OLAP) to various forms of advanced 
analytics. In that sense, it's similar in nature to business analytics, another umbrella term for 
approaches to analyzing data -- with the difference that the latter is oriented to business uses, while 
data analytics has a broader focus. The expansive view of the term isn't universal, though: In some 
cases, people use data analytics specifically to mean advanced analytics, treating BI as a separate 
category. Data analytics initiatives can help businesses increase revenues, improve operational 
efficiency, optimize marketing campaigns and customer service efforts, respond more quickly to 
emerging market trends and gain a competitive edge over rivals -- all with the ultimate goal of 
boosting business performance. Depending on the particular application, the data that's analyzed 
can consist of either historical records or new information that has been processed for real-time 
analytics uses. In addition, it can come from a mix of internal systems and external data sources. At 
a high level, data analytics methodologies include exploratory data analysis (EDA), which aims to find 
patterns and relationships in data, and confirmatory data analysis (CDA), which applies statistical 
techniques to determine whether hypotheses about a data set are true or false. EDA is often 
compared to detective work, while CDA is akin to the work of a judge or jury during a court trial -- a 
distinction first drawn by statistician John W. Tukey in his 1977 book Exploratory Data Analysis. Data 
analytics can also be separated into quantitative data analysis and qualitative data analysis. The 
former involves analysis of numerical data with quantifiable variables that can be compared or 
measured statistically. The qualitative approach is more interpretive -- it focuses on understanding 
the content of non-numerical data like text, images, audio and video, including common phrases, 
themes and points of view."""

#### a. What is the probability of the word “data” occurring in each line ?

In [15]:
para_line = para.splitlines()
line_w_str = 0
for line in para_line:
	if line.find('data') != -1:
		line_w_str += 1


n = len(para_line)
p = line_w_str/len(para_line)
binom_prob = binom.pmf(n, n, p) 
print('probability of "data" in line: ', binom_prob)

probability of "data" in line:  1.6169196351928867e-06


In [16]:
para_strip = re.sub(r'([^\w\s\d\'])+', '', re.sub(r'([\n])+', '', para.lower()))

#### b. What is the distribution of distinct word counts across all the lines ?

In [17]:
count = Counter(para_strip.split())

count_df = pd.DataFrame.from_dict(count,orient='index').reset_index().rename(columns={'index':'word', 0:'frequency'})
fig_count = px.bar(count_df.sort_values(by=['frequency'],ascending=False), x='word', y='frequency')
fig_count.show()

#### c. What is the probability of the word “analytics” occurring after the word “data” ?

In [18]:
print('probability of the word “analytics” occurring after the word “data”: ', para_strip.count("data analytics")/para_strip.count("data"))

probability of the word “analytics” occurring after the word “data”:  0.3333333333333333
