In [1]:
# This is a sample project created using dataramp

# Dataramp simplifies the way you structure your data science projects
# After installation, simpy run 'dr.core.create_project("your-project-name")' to get started

In [2]:
import warnings

warnings.filterwarnings('ignore')

import pandas as pd

import dataramp as dr

In [3]:
print(dr.__version__) # check library version

0.1.8.deva55a370


In [4]:
# Some of the built-in functionalities in dataramp you can use
from dataramp.utils import (
    describe_df,
    display_missing,
    feature_summary,
    get_cat_vars,
    get_num_vars,
    get_unique_counts,
)

In [5]:
from sklearn.datasets import load_iris

In [6]:
# Load your dataset
# df = pd.read_csv("../../datasets/raw/iris.csv")

In [7]:
iris = load_iris()

In [8]:
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='species')

In [9]:
# X['id'] = range(len(X))
# X['species'] = y

In [10]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [11]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64

In [12]:
df = pd.concat([X, y], axis=1)

In [13]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [14]:
display_missing(df, plot=False)

Unnamed: 0,variable,missing_count,missing_percent
0,sepal length (cm),0,0.0
1,sepal width (cm),0,0.0
2,petal length (cm),0,0.0
3,petal width (cm),0,0.0
4,species,0,0.0


In [15]:
describe_df(df.select_dtypes(include=['number'])) # description of the numerical variables

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal width (cm),150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal length (cm),150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal width (cm),150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5
species,150.0,1.0,0.819232,0.0,0.0,1.0,2.0,2.0


In [16]:
describe_df(df.select_dtypes(include=['number'])) # description of the categorical variables

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal width (cm),150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
petal length (cm),150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
petal width (cm),150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5
species,150.0,1.0,0.819232,0.0,0.0,1.0,2.0,2.0


In [17]:
feature_summary(df.select_dtypes(include=['number']))

Unnamed: 0,Null,Unique_Count,Data_type,Max,Min,Mean,Std,Skewness
sepal length (cm),0,35,float64,7.9,4.3,5.843333,0.828066,0.314911
sepal width (cm),0,23,float64,4.4,2.0,3.057333,0.435866,0.318966
petal length (cm),0,43,float64,6.9,1.0,3.758,1.765298,-0.274884
petal width (cm),0,22,float64,2.5,0.1,1.199333,0.762238,-0.102967
species,0,3,int64,2.0,0.0,1.0,0.819232,0.0


In [18]:
get_cat_vars(df)

[]

In [19]:
get_num_vars(df)

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'species']

In [20]:
get_unique_counts(df)

Unnamed: 0,Feature,Unique Count


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [30]:
from dataramp.utils import check_train_test_set

In [31]:
check_train_test_set(
    train_data=X_train,
    test_data=X_test,
    index=None,
    col=None
)

In [32]:
model = LogisticRegression(multi_class='multinomial', max_iter=1000)
model.fit(X_train, y_train)

In [33]:
y_pred = model.predict(X_test)

In [34]:
# Model performance
print(classification_report(y_test, y_pred, target_names=iris.target_names))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [36]:
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
