In [1]:
# Initial imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading and Preprocessing Malware Apps Data

Load the `app-data.csv` in a pandas DataFrame called `df_apps`

In [2]:
# Load data
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_2/datasets/app-data.csv"
df_apps = pd.read_csv(file_path)
df_apps.head()

Unnamed: 0,android.permission.GET_ACCOUNTS,com.sonyericsson.home.permission.BROADCAST_BADGE,android.permission.READ_PROFILE,android.permission.MANAGE_ACCOUNTS,android.permission.WRITE_SYNC_SETTINGS,android.permission.READ_EXTERNAL_STORAGE,android.permission.RECEIVE_SMS,com.android.launcher.permission.READ_SETTINGS,android.permission.WRITE_SETTINGS,com.google.android.providers.gsf.permission.READ_GSERVICES,...,com.android.launcher.permission.UNINSTALL_SHORTCUT,com.sec.android.iap.permission.BILLING,com.htc.launcher.permission.UPDATE_SHORTCUT,com.sec.android.provider.badge.permission.WRITE,android.permission.ACCESS_NETWORK_STATE,com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE,com.huawei.android.launcher.permission.READ_SETTINGS,android.permission.READ_SMS,android.permission.PROCESS_INCOMING_CALLS,Result
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0


Define the features set, by copying the `df_apps` DataFrame and dropping the `Result` column.

In [3]:
# Define features set
X = df_apps.copy()
X.drop("Result", axis=1, inplace=True)
X.head()

Unnamed: 0,android.permission.GET_ACCOUNTS,com.sonyericsson.home.permission.BROADCAST_BADGE,android.permission.READ_PROFILE,android.permission.MANAGE_ACCOUNTS,android.permission.WRITE_SYNC_SETTINGS,android.permission.READ_EXTERNAL_STORAGE,android.permission.RECEIVE_SMS,com.android.launcher.permission.READ_SETTINGS,android.permission.WRITE_SETTINGS,com.google.android.providers.gsf.permission.READ_GSERVICES,...,android.permission.CLEAR_APP_CACHE,com.android.launcher.permission.UNINSTALL_SHORTCUT,com.sec.android.iap.permission.BILLING,com.htc.launcher.permission.UPDATE_SHORTCUT,com.sec.android.provider.badge.permission.WRITE,android.permission.ACCESS_NETWORK_STATE,com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE,com.huawei.android.launcher.permission.READ_SETTINGS,android.permission.READ_SMS,android.permission.PROCESS_INCOMING_CALLS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


Create the target vector by assigning the values of the `Result` column from the `df_apps` DataFrame.

In [4]:
# Define target vector
y = df_apps["Result"].ravel()
y[:5]

array([0, 0, 0, 0, 0])

Split the data into training and testing sets.

In [5]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

## Fitting the Random Forest Model

Create a random forest instance and train it with the training data (`X_train` and `y_train`), define `n_estimators=128` and `random_state=78`.

In [7]:
# Create the random forest classifier instance
clf = RandomForestClassifier(random_state=78, n_estimators=128)
clf

In [9]:
# Fit the model
clf_fit = clf.fit(X_train, y_train)
clf_fit

## Making Predictions Using the Random Forest Model

Validate the trained model by malware apps using the testing data (`X_test`).

In [11]:
# Make predictions using the testing data
predictions = clf_fit.predict(X_test)
predictions
# print(f'Training Score: {clf.score(X_train, y_train)}')
# print(f'Testing Score: {clf.score(X_test, y_test)}')

array([1, 0, 1, ..., 1, 1, 1])

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the accuracy score.

In [12]:
# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)



In [13]:
# Display results
print(f"Accuracy Score : {acc_score}")
# Accuracy Score : 0.967543979271785

Accuracy Score : 0.9672712396018001


## Feature Importance

In this section, you are asked to fetch the features' importance from the random forest model and display the top 10 most important features.

In [14]:
# Get the feature importance array
feature_importances = clf_fit.feature_importances_

# List the top 10 most important features
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:10]

[(0.2526418903981927, 'android.permission.READ_PHONE_STATE'),
 (0.11323230656064623, 'com.google.android.c2dm.permission.RECEIVE'),
 (0.08156488911416043, 'android.permission.RECEIVE_BOOT_COMPLETED'),
 (0.06579127309698081, 'android.permission.ACCESS_COARSE_LOCATION'),
 (0.05883799614741168, 'com.android.launcher.permission.INSTALL_SHORTCUT'),
 (0.043793946991240375, 'android.permission.ACCESS_FINE_LOCATION'),
 (0.03813886967165418, 'android.permission.GET_TASKS'),
 (0.02303354853086105, 'android.permission.SYSTEM_ALERT_WINDOW'),
 (0.022989587959897147, 'android.permission.READ_EXTERNAL_STORAGE'),
 (0.022280304346836505, 'com.android.vending.BILLING')]

## Analysis Questions

Finally, analyze the model's evaluation results and answer the following questions.

* **Question:** Would you trust this model to detect malware? 

    * **Answer:** 

* **Question:** Out of the following models, which one had the highest accuracy score: logistic regression, SVM, KNN, decision tree, or random forest?

    * **Answer:**