In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report , accuracy_score

In [2]:
data=pd.read_csv("tennis.csv.txt")

In [3]:
#Converting .txt to .csv type
data.to_csv("tennis_time.csv",index=False)

In [4]:
data

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


###  Exploratory Data Analysis (EDA)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   outlook   14 non-null     object
 1   temp      14 non-null     object
 2   humidity  14 non-null     object
 3   windy     14 non-null     bool  
 4   play      14 non-null     object
dtypes: bool(1), object(4)
memory usage: 590.0+ bytes


In [6]:
data.isnull().sum()

outlook     0
temp        0
humidity    0
windy       0
play        0
dtype: int64

In [7]:
data.shape

(14, 5)

In [8]:
data=data.drop_duplicates()

In [9]:
data.shape

(14, 5)

### 1.Convert the given dataset into frequency tables.

In [10]:
# List of categorical features
categorical_features = ["outlook", "temp", "humidity", "windy"]

# Calculate frequencies for each feature and outcome
frequency_tables = {}
for feature in categorical_features:
    frequency_table = pd.crosstab(data[feature], data["play"], margins=True, margins_name="Total")
    frequency_tables[feature] = frequency_table
    
    # Display the calculated frequency table
    print(f"\nFrequency table for {feature.capitalize()}:")
    print(frequency_table)


Frequency table for Outlook:
play      no  yes  Total
outlook                 
overcast   0    4      4
rainy      2    3      5
sunny      3    2      5
Total      5    9     14

Frequency table for Temp:
play   no  yes  Total
temp                 
cool    1    3      4
hot     2    2      4
mild    2    4      6
Total   5    9     14

Frequency table for Humidity:
play      no  yes  Total
humidity                
high       4    3      7
normal     1    6      7
Total      5    9     14

Frequency table for Windy:
play   no  yes  Total
windy                
False   2    6      8
True    3    3      6
Total   5    9     14


### 2.Generate a Likelihood table by finding the probabilities of given features.

In [11]:
# Calculate probabilities by dividing the "Total" column by the total count
total_count = len(data)
likelihood_tables = {}
for feature, frequency_table in frequency_tables.items():
    likelihood_table = frequency_table.copy()
    likelihood_table["Total"] = likelihood_table["Total"] / total_count
    likelihood_tables[feature] = likelihood_table
    
    # Display the calculated likelihood table
    print(f"\nLikelihood table for {feature.capitalize()}:")
    print(likelihood_table)



Likelihood table for Outlook:
play      no  yes     Total
outlook                    
overcast   0    4  0.285714
rainy      2    3  0.357143
sunny      3    2  0.357143
Total      5    9  1.000000

Likelihood table for Temp:
play   no  yes     Total
temp                    
cool    1    3  0.285714
hot     2    2  0.285714
mild    2    4  0.428571
Total   5    9  1.000000

Likelihood table for Humidity:
play      no  yes  Total
humidity                
high       4    3    0.5
normal     1    6    0.5
Total      5    9    1.0

Likelihood table for Windy:
play   no  yes     Total
windy                   
False   2    6  0.571429
True    3    3  0.428571
Total   5    9  1.000000


### 3.Now, use the Bayes theorem to calculate the posterior probability.


In [12]:
# Calculate posterior probabilities for the entire dataset
posterior_probabilities = {}
for index, row in data.iterrows():
    posterior_probabilities[index] = {}
    
     # Calculate conditional probabilities for both classes
    for outcome in ["no", "yes"]:
        conditional_prob = 1
        for feature in categorical_features:
            likelihood_value = likelihood_tables[feature].loc[row[feature], outcome]
            conditional_prob *= likelihood_value
        posterior_probabilities[index][outcome] = conditional_prob
    
    # Normalize probabilities to make them sum up to 1
    total_prob = sum(posterior_probabilities[index].values())
    for outcome in ["no", "yes"]:
        posterior_probabilities[index][outcome] /= total_prob

    

In [13]:
# Display the calculated posterior probabilities
print("\nCalculated Posterior Probabilities:")
for index, probs in posterior_probabilities.items():
    print(f"Index: {index}")
    for outcome, prob in probs.items():
        print(f"P({outcome} | {' ,'.join(data.columns)}): {prob:.2f}")



Calculated Posterior Probabilities:
Index: 0
P(no | outlook ,temp ,humidity ,windy ,play): 0.40
P(yes | outlook ,temp ,humidity ,windy ,play): 0.60
Index: 1
P(no | outlook ,temp ,humidity ,windy ,play): 0.67
P(yes | outlook ,temp ,humidity ,windy ,play): 0.33
Index: 2
P(no | outlook ,temp ,humidity ,windy ,play): 0.00
P(yes | outlook ,temp ,humidity ,windy ,play): 1.00
Index: 3
P(no | outlook ,temp ,humidity ,windy ,play): 0.13
P(yes | outlook ,temp ,humidity ,windy ,play): 0.87
Index: 4
P(no | outlook ,temp ,humidity ,windy ,play): 0.01
P(yes | outlook ,temp ,humidity ,windy ,play): 0.99
Index: 5
P(no | outlook ,temp ,humidity ,windy ,play): 0.04
P(yes | outlook ,temp ,humidity ,windy ,play): 0.96
Index: 6
P(no | outlook ,temp ,humidity ,windy ,play): 0.00
P(yes | outlook ,temp ,humidity ,windy ,play): 1.00
Index: 7
P(no | outlook ,temp ,humidity ,windy ,play): 0.25
P(yes | outlook ,temp ,humidity ,windy ,play): 0.75
Index: 8
P(no | outlook ,temp ,humidity ,windy ,play): 0.03
P(yes |

In [14]:
# List of categorical features
categorical_features = ["outlook", "temp", "humidity", "windy"]

# Prepare features (X) and target variable (y)
X = data[categorical_features]
y = data["play"]



In [15]:
# Convert categorical features to the "category" dtype
for feature in categorical_features:
    X[feature] = X[feature].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [16]:
# Convert categorical features to numerical labels
X_encoded = X.copy()
for feature in categorical_features:
    X_encoded[feature] = X_encoded[feature].cat.codes


In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)



In [18]:
# Create and train the Naive Bayes model
model = CategoricalNB()
model.fit(X_train, y_train)

In [19]:
# Define different sets of input features for the predictions
input_features_list = [
    ["sunny", "mild", "high", "False"],
    ["rainy", "hot", "normal", "True"],
    ["overcast", "cool", "high", "False"]
    # Add more sets of input features as needed
]


In [20]:
# Convert the input features into numerical labels
input_features_encoded = []
for input_features in input_features_list:
    input_features_encoded.append([X_encoded[feature].cat.categories.get_loc(value) for feature, value in zip(categorical_features, input_features)])


AttributeError: Can only use .cat accessor with a 'category' dtype

In [23]:
# Convert categorical features to numerical labels
X_encoded = X.copy()
for feature in categorical_features:
    X_encoded[feature] = X_encoded[feature].astype("category").cat.codes


In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [21]:
# Convert the input features into numerical labels
input_features_encoded = []
for input_features in input_features_list:
    input_features_encoded.append([X_encoded[feature].cat.categories.get_loc(value) for feature, value in zip(categorical_features, input_features)])

AttributeError: Can only use .cat accessor with a 'category' dtype

In [None]:






# Predict the probabilities for each set of input features
probs = model.predict_proba(input_features_encoded)

# Interpret the probabilities for each set of input features
for i, input_features in enumerate(input_features_list):
    play_probability = probs[i][1]  # Probability of playing (class "yes")
    print(f"Probability of playing with features {input_features}: {play_probability:.2f}")
