In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import pandas as pd
from collections import Counter
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC



In [3]:
df = pd.read_csv('/content/drive/MyDrive/ML DATA/finalized_data.csv')

In [4]:
class TFIDFVectorizer:
    def __init__(self):
        self.vocab = {}
        self.idfs = {}

    def _idf(self, corpus, unique_words):
        idf_vals = {}
        total_docs = len(corpus)
        for word in tqdm(unique_words):
            cnt = 0
            for row in corpus:
                if word in row.split(" "):
                    cnt += 1
            idf_vals[word] = 1 + math.log((1 + total_docs) / (1 + cnt))
        return idf_vals

    def fit(self, dataset):
        if isinstance(dataset, (list,)):
            unique_words = set()
            for row in tqdm(dataset):
                for word in row.split(" "):
                    if len(word) < 2:
                        continue
                    unique_words.add(word)
            unique_words = sorted(list(unique_words))
            self.vocab = {j: i for i, j in enumerate(unique_words)}
            self.idfs = self._idf(dataset, unique_words)
        return self

    def transform(self, dataset):
        rows, cols, data = [], [], []
        for idx, row in tqdm(enumerate(dataset), total = len(dataset)):
            word_count = Counter(row.split(' '))
            for word, count in word_count.items():
                if word in self.vocab:
                    tf = count / len(row.split(' '))
                    tfidf = tf * self.idfs[word]
                    rows.append(idx)
                    cols.append(self.vocab[word])
                    data.append(tfidf)
        sparse_matrix = csr_matrix((data, (rows, cols)), shape=(len(dataset), len(self.vocab)))
        return normalize(sparse_matrix, norm='l2', axis=1, copy=True)

    def fit_transform(self, dataset):
        self.fit(dataset)
        return self.transform(dataset)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12251 entries, 0 to 12250
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     12251 non-null  int64 
 1   Category       12251 non-null  object
 2   Category Area  12251 non-null  object
 3   Text           12251 non-null  object
dtypes: int64(1), object(3)
memory usage: 383.0+ KB


# სამწუხაროდ ჩემი დაწერილი TF-IDF დიდ კორპუსზე ძალიან ნელა მუშაობს, ამიტომ მიწევს sklearn-ის გამოყენება :(

In [38]:
tfidf = TfidfVectorizer( norm=None)
tfidf.fit(df["Text"])
weights = tfidf.transform(df["Text"])


In [9]:
le = LabelEncoder()

le.fit(df["Category"])

df["Category"] = le.transform(df["Category"])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(weights, df["Category"], test_size=0.1, random_state=42)

In [11]:
df["Category"].unique()


array([ 0,  1,  2,  3,  4,  5,  7,  8, 10, 11,  9, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 24, 25, 23, 26, 27, 30, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 45, 43, 44, 46, 47, 28, 31,  6, 20, 29])

ახლა, ამ სეტაპით მინდა განვიხილო რამდენიმე მოდელი:

1.   **KNN**

2.   **logistic regression**

3.   **SVM**

4.   **Random Forest Regression**



In [15]:
weights[0].shape

(1, 118730)

In [12]:
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


In [13]:
accuracy

0.36786296900489396

ერთ-ერთი რამ, რაც უნდა გვექნა აქამდე და არ ვქენით, არის weight-ების dimension-ის ცვლილება. ერთ-ერთი ამის ვარიანტი არის SVD ფაქტორიზაცია.

In [39]:
svd = TruncatedSVD(n_components=1200, random_state=42)
reduced_weights = svd.fit_transform(weights)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(reduced_weights, df["Category"], test_size=0.1, random_state=42)

0.6639477977161501

In [27]:
models = {
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=50),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC()

}
accuracy_scores = {}
# Train and evaluate each model
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[model_name] = accuracy

    # Print the accuracy
    print(f'Accuracy of {model_name} on test set: {accuracy:.2f}')


Accuracy of KNN on test set: 0.66


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy of LogisticRegression on test set: 0.77
Accuracy of RandomForest on test set: 0.72
Accuracy of SVM on test set: 0.74


ეს სივი არის დაგენერირებული ჯიპიტის მიერ. არანაირი ინსტრუქცია, არანაირი დატის წმენდა არ ჩამიტარებია, არაფერი არ მითქვამს გარდა იმისა, რომ ფიტნეს ინსტრუქტორის სივი დაეგენერირებინა.

In [40]:
cv_text = """John Doe
123 Fitness Ave
Gym City, GA 30303
(555) 123-4567
johndoe@example.com
LinkedIn: linkedin.com/in/johndoe-fitness
Objective
Dynamic and certified Health and Fitness Instructor with over 5 years of experience specializing in creating personalized workout programs and conducting high-energy group fitness classes. Dedicated to helping clients achieve their health and fitness goals through tailored exercise routines and comprehensive nutritional guidance.
Certifications
Certified Personal Trainer, American Council on Exercise (ACE), 2018
Group Fitness Instructor, National Academy of Sports Medicine (NASM), 2019
CPR and First Aid, American Red Cross, 2020
Yoga Instructor, 200-Hour RYT, Yoga Alliance, 2017
Professional Experience
Health and Fitness Instructor
Gold's Gym, Atlanta, GA
June 2020 – Present
Design and implement customized fitness programs for over 100 regular clients, increasing their fitness performance by an average of 25%.
Lead weekly group fitness classes, including Spin, HIIT, and Yoga, with an average attendance of 30 participants per class.
Conduct bi-weekly workshops on nutrition and wellness that have improved client retention by 20%.
Utilize motivational interviewing techniques to encourage client commitment and achieve a 95% success rate in client goal attainment.
Fitness Coach
Anytime Fitness, Atlanta, GA
May 2015 – May 2020
Spearheaded a successful bootcamp program that grew to include 50 regular participants.
Collaborated with physical therapists to design rehabilitation exercises that safely engaged clients in physical activity post-injury.
Increased gym membership by organizing monthly health and wellness fairs that attracted over 200 attendees.
Education
Bachelor of Science in Exercise Science
Georgia State University, Atlanta, GA
August 2011 – May 2015
Skills
Expert in creating tailored fitness programs.
Proficient with digital fitness tracking systems like Fitbit and MyFitnessPal.
Excellent communicator with effective client-facing skills.
Knowledgeable in body mechanics and functional training.
Fluent in Spanish.
Professional Affiliations
Member, National Strength and Conditioning Association (NSCA)
Volunteer, Community Fitness Days, providing free fitness coaching to underprivileged communities
References
Available upon request."""

# Assuming 'vectorizer' is already fitted to a relevant corpus
tfidf_weights = tfidf.transform([cv_text])
reduced_weights = svd.transform(tfidf_weights)

In [43]:
reduced_weights.shape

(1, 1200)

In [47]:
for model_name, model in models.items():
  label = model.predict(reduced_weights)

  decoded_label = le.inverse_transform(label)
  print(f"Decoded label for {model_name}:", decoded_label)

Decoded label for KNN: ['Health and Fitness']
Decoded label for LogisticRegression: ['Health and Fitness']
Decoded label for RandomForest: ['Health and Fitness']
Decoded label for SVM: ['Health and Fitness']


შემდეგ, გავტესტავ ჩემზე და ჩემი მეგობრების სივი-ებზე.