<a href="https://colab.research.google.com/github/Manvi1718/Apache-Log-File-Analysis/blob/main/AI_assignmnet_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI ALGORITHMS

### Required Libraries

In [46]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import confusion_matrix, classification_report
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import r2_score
from gensim.models import Word2Vec
import os
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from urllib.request import urlretrieve

### Device Agnostic Code

In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## 1) Linear Regression

In [48]:
# Linear Regression
def linear_regression():
    # Toy dataset
    X, y = datasets.make_regression(n_samples=1000, n_features=1, noise=0.1)

    # Convert to PyTorch tensors and move to GPU
    X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
    y_tensor = torch.tensor(y, dtype=torch.float32).to(device)

    # Linear Regression model
    model = nn.Linear(1, 1).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    # Training
    for epoch in range(200):
        optimizer.zero_grad()
        outputs = model(X_tensor)
        # Squeeze the output tensor to match the target tensor dimensions
        outputs = outputs.squeeze(dim=1)
        loss = criterion(outputs, y_tensor)
        loss.backward()
        optimizer.step()
        if (epoch+1) % 20== 0:
            print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')

            # Calculate R-squared
            with torch.no_grad():
                predicted = model(X_tensor).cpu().numpy()
                r2 = r2_score(y, predicted)
                print(f'R-squared: {r2:.4f}')

In [49]:
# Run linear regression
linear_regression()

Epoch [20/100], Loss: 79.0598
R-squared: 0.5498
Epoch [40/100], Loss: 34.8749
R-squared: 0.8014
Epoch [60/100], Loss: 15.3966
R-squared: 0.9123
Epoch [80/100], Loss: 6.8045
R-squared: 0.9612
Epoch [100/100], Loss: 3.0121
R-squared: 0.9828
Epoch [120/100], Loss: 1.3372
R-squared: 0.9924
Epoch [140/100], Loss: 0.5970
R-squared: 0.9966
Epoch [160/100], Loss: 0.2696
R-squared: 0.9985
Epoch [180/100], Loss: 0.1248
R-squared: 0.9993
Epoch [200/100], Loss: 0.0607
R-squared: 0.9997


Value of R squared close to 1 ,means its a good fit

## 2) Logistic Regression

In [50]:
# Logistic Regression
def logistic_regression(reg_strength):
    # Toy dataset
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Convert to PyTorch tensors and move to GPU
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

    # Logistic Regression model
    model = LogisticRegression(max_iter=100,C=reg_strength).fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print(f'Logistic Regression Accuracy: {accuracy:.2f}%')
    print(f'Regularization Strength (C): {reg_strength}')


In [51]:
# Run logistic regression
logistic_regression(0.01)

Logistic Regression Accuracy: 80.00%
Regularization Strength (C): 0.01


Regularisation strength to reduce overfitting in logistic regression. use smaller value of C

## 3) Decision Tree

Methods in Decision tree to solve overfitting isues in Decision Trees

* Pruning - reduce the size of decision by removing its brances, done by `ccp_alpha` , it is the complexity parameter.

* Limiting depth - limiting the maximum depth of the tree , done by setting `max_depth` parameter
* Minimum samples split and leaf -
we can also control the minimum number of samples to split `min_samples_split` and internal node and minimum number of samples required to be at leaf node `min_samples_leaf`.

In [52]:
# Decision Tree
def decision_tree():
    # Toy dataset
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Decision Tree model
    model = DecisionTreeClassifier(ccp_alpha=0.01,max_depth=2)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print(f'Decision Tree Accuracy: {accuracy:.2f}%')


In [53]:
decision_tree()

Decision Tree Accuracy: 90.00%


## 4) SVM

In [54]:
def svm():
    # Toy dataset
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # SVM model
    model = SVC(C=0.1)
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print(f'SVM Accuracy: {accuracy:.2f}%')

In [55]:
svm()

SVM Accuracy: 90.00%


## 5) Naive bayes

In [56]:
def naive_bayes():
    # Toy dataset
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    X, y = newsgroups.data, newsgroups.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Naive-Bayes model
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)*100
    print(f'Naive-Bayes Accuracy: {accuracy:.2f}%')

In [57]:
naive_bayes()

Naive-Bayes Accuracy: 65.78%


## 6) TF-IDF

In [58]:
# TF-IDF
def tfidf():
    # Toy dataset
    newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
    X, y = newsgroups.data, newsgroups.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # TF-IDF model with reduced vocabulary size
    vectorizer = TfidfVectorizer(max_features=1000,ngram_range=(1,2))  # Adjust max_features as needed
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Convert to PyTorch tensors and move to GPU
    X_train_tensor = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32).to(device)
    X_test_tensor = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)

    # Simple Neural Network model with reduced hidden size
    class SimpleNN(nn.Module):
        def __init__(self, input_size, hidden_size, output_size):
            super(SimpleNN, self).__init__()
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.relu = nn.ReLU()
            self.fc2 = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            out = self.fc1(x)
            out = self.relu(out)
            out = self.fc2(out)
            return out

    # Instantiate the model with reduced hidden size
    model = SimpleNN(input_size=X_train_tensor.shape[1], hidden_size=32, output_size=len(set(y))).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training
    for epoch in range(50):  # Reduced number of epochs
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/50], Loss: {loss.item():.4f}')

    # Testing
    with torch.no_grad():
        outputs = model(X_test_tensor)
        _, predicted = torch.max(outputs, 1)
        accuracy = accuracy_score(y_test, predicted.cpu().numpy())*100

    print(f'TF-IDF Neural Network Accuracy: {accuracy:.2f}%')

In [59]:
tfidf()

Epoch [10/50], Loss: 2.9897
Epoch [20/50], Loss: 2.9661
Epoch [30/50], Loss: 2.9358
Epoch [40/50], Loss: 2.8986
Epoch [50/50], Loss: 2.8541
TF-IDF Neural Network Accuracy: 24.69%


## 7) Word2Vec

In [60]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
# Word2Vec
def word2vec():
    # Download the 'punkt' resource
    import nltk
    nltk.download('punkt')

    # Toy dataset
    corpus = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]

    # Tokenize and train Word2Vec model
    tokenized_corpus = [nltk.word_tokenize(sentence.lower()) for sentence in corpus]
    word2vec_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

    # Example: Finding the vector for a word
    vector = word2vec_model.wv['document']
    print("Word2Vec Vector for 'document':")
    print(vector)

In [62]:
word2vec()

Word2Vec Vector for 'document':
[-5.3761393e-04  2.3459077e-04  5.1012170e-03  9.0115219e-03
 -9.3035055e-03 -7.1186870e-03  6.4577162e-03  8.9744031e-03
 -5.0161965e-03 -3.7644049e-03  7.3809391e-03 -1.5342169e-03
 -4.5370674e-03  6.5543531e-03 -4.8609949e-03 -1.8136933e-03
  2.8776617e-03  9.8915887e-04 -8.2834894e-03 -9.4506554e-03
  7.3119737e-03  5.0714435e-03  6.7562792e-03  7.6230383e-04
  6.3530928e-03 -3.4065295e-03 -9.4848091e-04  5.7711215e-03
 -7.5222286e-03 -3.9373739e-03 -7.5092558e-03 -9.2885981e-04
  9.5392875e-03 -7.3166536e-03 -2.3360765e-03 -1.9363161e-03
  8.0779977e-03 -5.9297686e-03  4.5617318e-05 -4.7524953e-03
 -9.6023204e-03  5.0089518e-03 -8.7604597e-03 -4.3930719e-03
 -3.5214103e-05 -2.9548592e-04 -7.6621324e-03  9.6163880e-03
  4.9832016e-03  9.2352722e-03 -8.1572160e-03  4.4980138e-03
 -4.1374546e-03  8.2675234e-04  8.4996261e-03 -4.4643688e-03
  4.5164214e-03 -6.7876368e-03 -3.5471660e-03  9.3982853e-03
 -1.5784105e-03  3.2352045e-04 -4.1381051e-03 -7.6831

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 8) GloVe

In [63]:
def download_glove_model(output_file):
    glove_url = 'https://nlp.stanford.edu/data/glove.6B.zip'
    glove_zip_file = 'glove.6B.zip'

    # Download GloVe zip file
    urlretrieve(glove_url, glove_zip_file)

    # Unzip GloVe file
    from zipfile import ZipFile
    with ZipFile(glove_zip_file, 'r') as zip_ref:
        zip_ref.extractall()

    # Convert GloVe format to Word2Vec format
    glove_txt_file = 'glove.6B.100d.txt'
    glove2word2vec(glove_txt_file, output_file)

    # Clean up - remove downloaded zip file
    os.remove(glove_zip_file)

    return output_file

In [64]:
def glove():
    # Define the output file for the converted GloVe model
    word2vec_output_file = 'glove.6B.100d.word2vec'

    # Download and convert GloVe model if not already downloaded
    if not os.path.exists(word2vec_output_file):
        download_glove_model(word2vec_output_file)

    # Load GloVe embeddings using Gensim
    glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

    # Example: Finding the vector for a word
    vector = glove_model['document']
    print("Glove Vector for 'document':")
    print(vector)

In [65]:
glove()

Glove Vector for 'document':
[-2.7285e-01 -9.6449e-02  4.1131e-01  3.7925e-01  8.9352e-01  4.5227e-01
  1.9478e-01 -3.6985e-01  5.9704e-01  1.3387e-01  4.2878e-01 -2.8012e-01
  2.0141e-01 -1.9995e-02 -6.2983e-02  7.1399e-01  8.9025e-01 -3.1009e-01
 -1.9911e-01 -4.6591e-01 -8.8145e-01 -5.4318e-01 -5.2839e-01  7.0794e-02
 -3.1042e-01 -9.8677e-01  1.0283e-01  1.6911e-01 -4.4878e-01  1.6171e-01
  3.9394e-01  1.2655e-01 -1.2540e-01 -6.6462e-02 -1.2977e-01 -3.9406e-02
  4.4811e-02 -4.2534e-01  2.6742e-02 -3.8609e-01 -8.4547e-01 -6.4412e-02
  6.8974e-01  2.4521e-01 -7.3434e-01 -7.7389e-01 -1.5336e-01 -2.9057e-01
 -6.8358e-01 -3.8785e-01  1.2230e+00  1.7723e-01  1.6004e-01  8.3723e-01
 -3.1238e-01 -1.3138e+00 -2.6000e-01 -4.8754e-01  1.6751e+00  1.7320e-01
 -2.9494e-01  1.6038e-01 -5.3087e-01 -9.0950e-01  6.7436e-01 -5.2625e-01
 -3.0406e-01  8.5552e-01 -2.6879e-01 -9.0492e-01  3.0380e-01  2.0591e-01
  3.3439e-01 -6.2308e-01  6.4306e-02  2.2179e-01 -9.2076e-02  2.1894e-01
 -1.4015e+00 -4.4588e-

## 9) Lemmatization

In [66]:
# Lemmatization - NLP
def lemmatization():
    import nltk

    # Download WordNet
    nltk.download('wordnet')

    # Toy dataset
    corpus = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_corpus = [' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(sentence)]) for sentence in corpus]

    print("Lemmatized Corpus:")
    print(lemmatized_corpus)

In [67]:
lemmatization()

Lemmatized Corpus:
['This is the first document .', 'This document is the second document .', 'And this is the third one .', 'Is this the first document ?']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 10) Recurrent Neural Network (RNN)

In [68]:
# Toy dataset
sequences = [
    "this is good",
    "that is bad",
    "this is excellent",
    "that is horrible"
]
labels = [1, 0, 1, 0]

# Tokenize and convert to PyTorch tensors
tokenized_sequences = [nltk.word_tokenize(sentence.lower()) for sentence in sequences]
word2vec_model = Word2Vec(sentences=tokenized_sequences, vector_size=100, window=5, min_count=1, workers=4)
X_rnn = torch.tensor([word2vec_model.wv[s] for s in tokenized_sequences], dtype=torch.float32).to(device)
y_rnn = torch.tensor(labels, dtype=torch.float32).to(device)


In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import movie_reviews
import random

# Set a random seed for reproducibility
random.seed(42)

# Download the movie reviews dataset from nltk
nltk.download('movie_reviews')

# Load the movie reviews and labels
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents to mix positive and negative reviews
random.shuffle(documents)

# Separate the reviews and labels
reviews = [" ".join(words) for words, label in documents]
labels = [1 if label == 'pos' else 0 for words, label in documents]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

# Tokenize and convert to PyTorch tensors
def tokenize_and_vectorize(data, max_length=200):
    tokenized_data = [word_tokenize(sentence.lower()) for sentence in data]
    vectorized_data = torch.zeros(len(tokenized_data), max_length, 100)

    for i, sentence in enumerate(tokenized_data):
        for j, word in enumerate(sentence):
            if j == max_length:
                break
            if word in word2vec_model.wv:
                vectorized_data[i, j, :] = torch.tensor(word2vec_model.wv[word])

    return vectorized_data

X_train_tensor = tokenize_and_vectorize(X_train)
X_test_tensor = tokenize_and_vectorize(X_test)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Simple RNN model with non-linearity
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.activation = nn.Tanh()  # Tanh activation
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.activation(out)  #Tanh activation
        out = self.fc(out[:, -1, :])
        return out

# Instantiate the RNN model
rnn_model = SimpleRNN(input_size=100, hidden_size=64, output_size=1)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)

# Training
for epoch in range(10):
    optimizer.zero_grad()
    outputs = rnn_model(X_train_tensor)
    loss = criterion(outputs.squeeze(), y_train_tensor)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        rnn_model.eval()
        predictions = (torch.sigmoid(rnn_model(X_test_tensor)) > 0.5).float()
        accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predictions.cpu().numpy()) * 100
        print(f'Epoch [{epoch + 1}/10], Loss: {loss.item():.4f}, Accuracy: {accuracy:.2f}%')
        rnn_model.train()  # Set back to training mode


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Epoch [1/10], Loss: 0.6932, Accuracy: 50.25%
Epoch [2/10], Loss: 0.6933, Accuracy: 50.25%
Epoch [3/10], Loss: 0.6931, Accuracy: 49.75%
Epoch [4/10], Loss: 0.6931, Accuracy: 49.75%
Epoch [5/10], Loss: 0.6932, Accuracy: 49.75%
Epoch [6/10], Loss: 0.6931, Accuracy: 50.25%
Epoch [7/10], Loss: 0.6931, Accuracy: 49.50%
Epoch [8/10], Loss: 0.6931, Accuracy: 50.25%
Epoch [9/10], Loss: 0.6931, Accuracy: 50.25%
Epoch [10/10], Loss: 0.6931, Accuracy: 50.25%
