## Data science practice questions

In [10]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection 
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier 
import pandas as pd 

def train_and_predict(train_input_features, train_outputs, prediction_features):
    """
    :param train_input_features: (numpy.array) A two-dimensional NumPy array where each element
                        is an array that contains: sepal length, sepal width, petal length, and petal width   
    :param train_outputs: (numpy.array) A one-dimensional NumPy array where each element
                        is a number representing the species of iris which is described in
                        the same row of train_input_features. 0 represents Iris setosa,
                        1 represents Iris versicolor, and 2 represents Iris virginica.
    :param prediction_features: (numpy.array) A two-dimensional NumPy array where each element
                        is an array that contains: sepal length, sepal width, petal length, and petal width
    :returns: (list) The function should return an iterable (like list or numpy.ndarray) of the predicted 
                        iris species, one for each item in prediction_features
    """   
    
    X = np.array(train_input_features)
    Y = np.array(train_outputs)
    #Y = np.reshape(Y,newshape = (len(Y),1))

    seed = 8
    kfold = model_selection.KFold(n_splits = 3, 
                           random_state = seed) 

    # initialize the base classifier 
    base_cls = DecisionTreeClassifier(min_samples_split = 5) 

    # no. of base classifier 
    num_trees = 100

    # bagging classifier 
    model = BaggingClassifier(base_estimator = base_cls, 
                              n_estimators = num_trees, 
                              random_state = seed,
                             max_samples = 50,
                             bootstrap = True)
    fit = model.fit(X,Y)

    results = model_selection.cross_val_score(model, X, Y, cv = kfold) 
    return fit.predict(prediction_features)

iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.3, random_state=0)

y_pred = train_and_predict(X_train, y_train, X_test)
if y_pred is not None:
    print(metrics.accuracy_score(y_test, y_pred))

0.9777777777777777


In [13]:
import pandas as pd
import numpy as np

def login_table(id_name_verified, id_password):
    """
    :param id_name_verified: (DataFrame) DataFrame with columns: Id, Login, Verified.   
    :param id_password: (numpy.array) Two-dimensional NumPy array where each element
                        is an array that contains: Id and Password
    :returns: (None) The function should modify id_name_verified DataFrame in-place. 
              It should not return anything.
    """   
    id_name_verified.drop(["Verified"],axis=1,inplace = True)
    
    pass

id_name_verified = pd.DataFrame([[1, "JohnDoe", True], [2, "AnnFranklin", False]], columns=["Id", "Login", "Verified"])
id_password = np.array([[1, 987340123], [2, 187031122]], np.int32)
login_table(id_name_verified, id_password)
print(id_name_verified)

   Id        Login
0   1      JohnDoe
1   2  AnnFranklin


In [1]:
import pandas as pd
import numpy as np

def most_corr(prices):
    """
    :param prices: (pandas.DataFrame) A dataframe containing each ticker's 
                   daily closing prices.
    :returns: (container of strings) A container, containing the two tickers that 
              are the most highly (linearly) correlated by daily percentage change.
    """
    return None

#For example, the code below should print: ('FB', 'MSFT')
print(most_corr(pd.DataFrame.from_dict({
    'GOOG' : [
        742.66, 738.40, 738.22, 741.16,
        739.98, 747.28, 746.22, 741.80,
        745.33, 741.29, 742.83, 750.50
    ],
    'FB' : [
        108.40, 107.92, 109.64, 112.22,
        109.57, 113.82, 114.03, 112.24,
        114.68, 112.92, 113.28, 115.40
    ],
    'MSFT' : [
        55.40, 54.63, 54.98, 55.88,
        54.12, 59.16, 58.14, 55.97,
        61.20, 57.14, 56.62, 59.25
    ],
    'AAPL' : [
        106.00, 104.66, 104.87, 105.69,
        104.22, 110.16, 109.84, 108.86,
        110.14, 107.66, 108.08, 109.90
    ]
})))

None


In [4]:
prices = pd.DataFrame.from_dict({
    'GOOG' : [
        742.66, 738.40, 738.22, 741.16,
        739.98, 747.28, 746.22, 741.80,
        745.33, 741.29, 742.83, 750.50
    ],
    'FB' : [
        108.40, 107.92, 109.64, 112.22,
        109.57, 113.82, 114.03, 112.24,
        114.68, 112.92, 113.28, 115.40
    ],
    'MSFT' : [
        55.40, 54.63, 54.98, 55.88,
        54.12, 59.16, 58.14, 55.97,
        61.20, 57.14, 56.62, 59.25
    ],
    'AAPL' : [
        106.00, 104.66, 104.87, 105.69,
        104.22, 110.16, 109.84, 108.86,
        110.14, 107.66, 108.08, 109.90
    ]
})

In [12]:
col_list = []
for col in list(prices.columns):
    prices[col+"_pct"] = (prices[col]/prices[col].shift(1))-1
    col_list.append(col+"_pct")
corr = prices[col_list]
corr = corr.dropna()
corr.reset_index(inplace = True, drop = True)
demean = corr - corr.mean()
n = len(demean)
cov = np.array(demean.T)@np.array(demean)/n
def cov2corr(cov):
    # Derive the correlation matrix from a covariance matrix
    std=np.sqrt(np.diag(cov))
    corr=cov/np.outer(std,std)
    corr[corr<-1],corr[corr>1]=-1,1 # numerical error
    return corr
correlation = cov2corr(cov)

np.fill_diagonal(correlation,0)
idx = np.unravel_index(np.argmax(correlation, axis=None), correlation.shape)
string = (list(prices.columns)[idx[0]],list(prices.columns)[idx[1]])

In [27]:
import pandas as pd
import numpy as np
def cov2corr(cov):
    # Derive the correlation matrix from a covariance matrix
    std=np.sqrt(np.diag(cov))
    corr=cov/np.outer(std,std)
    corr[corr<-1],corr[corr>1]=-1,1 # numerical error
    return corr
def most_corr(prices):
    col_list = []
    for col in list(prices.columns):
        prices[col+"_pct"] = (prices[col]/prices[col].shift(1))-1
        col_list.append(col+"_pct")
        corr = prices[col_list]
        print(col)
    corr = corr.dropna()
    corr.reset_index(inplace = True, drop = True)
    demean = corr - corr.mean()
    n = len(demean)
    cov = np.array(demean.T)@np.array(demean)/n
    print(demean)
    
    correlation = cov2corr(cov)
    print(corr)

    np.fill_diagonal(correlation,0)
    idx = np.unravel_index(np.argmax(correlation, axis=None), correlation.shape)
    string = (list(prices.columns)[idx[0]],list(prices.columns)[idx[1]])
    return string

#For example, the code below should print: ('FB', 'MSFT')
print(most_corr(pd.DataFrame.from_dict({
    'GOOG' : [
        742.66, 738.40, 738.22, 741.16,
        739.98, 747.28, 746.22, 741.80,
        745.33, 741.29, 742.83, 750.50
    ],
    'FB' : [
        108.40, 107.92, 109.64, 112.22,
        109.57, 113.82, 114.03, 112.24,
        114.68, 112.92, 113.28, 115.40
    ],
    'MSFT' : [
        55.40, 54.63, 54.98, 55.88,
        54.12, 59.16, 58.14, 55.97,
        61.20, 57.14, 56.62, 59.25
    ],
    'AAPL' : [
        106.00, 104.66, 104.87, 105.69,
        104.22, 110.16, 109.84, 108.86,
        110.14, 107.66, 108.08, 109.90
    ]
})))

GOOG
FB
MSFT
AAPL
    GOOG_pct    FB_pct  MSFT_pct  AAPL_pct
0  -0.006707 -0.010306 -0.021208 -0.016135
1  -0.001214  0.010060 -0.000902 -0.001487
2   0.003012  0.017654  0.009061  0.004326
3  -0.002563 -0.029492 -0.038805 -0.017402
4   0.008895  0.032910  0.085818  0.053502
5  -0.002389 -0.004033 -0.024550 -0.006398
6  -0.006894 -0.021576 -0.044632 -0.012415
7   0.003788  0.015861  0.086134  0.008265
8  -0.006391 -0.021225 -0.073649 -0.026010
9   0.001107 -0.002690 -0.016409  0.000408
10  0.009355  0.012837  0.039141  0.013346
    GOOG_pct    FB_pct  MSFT_pct  AAPL_pct
0  -0.005736 -0.004428 -0.013899 -0.012642
1  -0.000244  0.015938  0.006407  0.002006
2   0.003983  0.023532  0.016370  0.007819
3  -0.001592 -0.023614 -0.031496 -0.013909
4   0.009865  0.038788  0.093126  0.056995
5  -0.001418  0.001845 -0.017241 -0.002905
6  -0.005923 -0.015698 -0.037324 -0.008922
7   0.004759  0.021739  0.093443  0.011758
8  -0.005420 -0.015347 -0.066340 -0.022517
9   0.002077  0.003188 -0.009100  0.

In [28]:
import numpy as np
from sklearn import linear_model

def desired_marketing_expenditure(marketing_expenditure, units_sold, desired_units_sold):
    """
    :param marketing_expenditure: (list) A list of integers with the expenditure for each previous campaign.
    :param units_sold: (list) A list of integers with the number of units sold for each previous campaign.
    :param desired_units_sold: (integer) Target number of units to sell in the new campaign.
    :returns: (float) Required amount of money to be invested.
    """
    return None

#For example, with the parameters below, the function should return 250000.0
print(desired_marketing_expenditure(
    [300000, 200000, 400000, 300000, 100000],
    [60000, 50000, 90000, 80000, 30000],
    60000))

None


In [30]:
X,Y = [300000, 200000, 400000, 300000, 100000],[60000, 50000, 90000, 80000, 30000], 

In [33]:
import statsmodels.api as sm


### X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
print(results.summary())