In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import hstack, csr_matrix

In [2]:
# === Step 1. Load data ===
# Labelled texts: the "label" column is renamed to "level" and every row that
# contains a missing value is dropped.
texts_df = (
    pd.read_csv("cefr_leveled_texts.csv")
    .rename(columns={"label": "level"})
    .dropna()
)

# Grammar features read from a separate CSV.
# NOTE(review): presumably one row per text, in the same order as texts_df
# after dropna() - confirm against whatever produced grammar_features.csv.
grammar_df = pd.read_csv("grammar_features.csv")

# The text-based and grammar-based feature blocks are stacked side by side
# further down, so both frames must have the same number of rows. Checking here
# fails fast with a readable message, before the expensive embedding step.
assert len(texts_df) == len(grammar_df), (
    f"Row count mismatch: texts_df has {len(texts_df)} rows after dropna(), "
    f"grammar_df has {len(grammar_df)} rows"
)

In [3]:
# === Step 2. SBERT embeddings ===
# Load the pre-trained sentence-transformers model; it is reused later to
# encode new texts at prediction time.
model_st = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode every text of the corpus in one call (progress bar disabled).
X_sbert = model_st.encode(
    texts_df['text'].tolist(),
    show_progress_bar=False,
)

In [4]:
# === Step 3. TF-IDF ===
# Unigram + bigram TF-IDF limited to 3000 features, with English stop words
# removed.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed in a later cell - confirm this is intended.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=3000,
)
X_tfidf = vectorizer.fit_transform(texts_df['text'])

In [5]:
# === Step 4. Grammar features ===
# Columns of grammar_df used as model inputs; their order defines the layout of
# the grammar feature block.
grammar_columns = [
    'passive_count',
    'modal_count',
    'conditional_type_2',
    'avg_sentence_len',
]
X_grammar = grammar_df[grammar_columns]

# Standardise each column (StandardScaler with default settings).
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
grammar_scaler = StandardScaler()
X_grammar_scaled = grammar_scaler.fit_transform(X_grammar)

In [6]:
# === Step 5. Combine feature blocks ===
from scipy.sparse import hstack

# Column order of the combined matrix: SBERT embeddings, then TF-IDF, then the
# scaled grammar features. Any vector passed to the trained model must follow
# the same order.
feature_blocks = [
    csr_matrix(X_sbert),           # dense SBERT embeddings -> sparse
    X_tfidf,                       # TF-IDF output is already sparse
    csr_matrix(X_grammar_scaled),  # scaled grammar features -> sparse
]
X_combined = hstack(feature_blocks)

# Target labels.
y = texts_df['level']

In [7]:
# === Step 6. Training and evaluation ===
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [8]:
# Hold out 20% of the rows for evaluation; the split is stratified on the label
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Logistic regression on the combined features (iteration cap raised to 1000),
# followed by predictions for the held-out rows.
model_combined = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model_combined.predict(X_test)

In [9]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

          A1       0.75      0.86      0.80        58
          A2       0.76      0.58      0.66        55
          B1       0.45      0.37      0.41        41
          B2       0.39      0.44      0.41        57
          C1       0.44      0.50      0.47        48
          C2       0.67      0.65      0.66        40

    accuracy                           0.58       299
   macro avg       0.58      0.57      0.57       299
weighted avg       0.58      0.58      0.57       299



In [10]:
def extract_grammar_features(text):
    """Compute four regex-based grammar features for a single text.

    The input is converted with ``str()`` and lower-cased first. Words are
    runs of the letters a-z delimited by word boundaries; the sentence count
    is the number of ``'.'`` characters, with a minimum of 1.

    Args:
        text: The text to analyse (any object; it is passed through ``str``).

    Returns:
        A list of four numbers, in this order:
          0. passive-like matches (a form of "be", whitespace, then a word
             ending in "ed") divided by the sentence count;
          1. modal-verb matches divided by the word count, or 0 when the
             text contains no words;
          2. number of "if <word> had/was/were ... would" matches (a raw
             count, not normalised);
          3. words per sentence.

    NOTE(review): only '.' is treated as a sentence terminator ('!' and '?'
    are ignored). The four positions presumably correspond to the grammar
    columns the model was trained on - confirm the training CSV was produced
    with the same definitions.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    lowered = str(text).lower()
    n_words = len(re.findall(r'\b[a-z]+\b', lowered))
    n_sentences = max(1, lowered.count('.'))

    def count_matches(pattern):
        # Number of non-overlapping matches of `pattern` in the lowered text.
        return len(re.findall(pattern, lowered))

    passive_hits = count_matches(r'\b(be|is|are|was|were|been|being)\s+\w+ed\b')
    modal_hits = count_matches(r'\b(would|could|should|might|may|can|must|shall|will)\b')
    conditional_hits = count_matches(r'\bif\s+\w+\s+(had|was|were)\b.*?\bwould\b')

    # Guard against division by zero for texts without any a-z word.
    if n_words:
        modal_ratio = modal_hits / n_words
    else:
        modal_ratio = 0

    return [
        passive_hits / n_sentences,
        modal_ratio,
        conditional_hits,
        n_words / n_sentences,
    ]

# Holds the scaler fitted on X_grammar together with the exact object it was
# fitted on, so repeated predictions do not refit it.
_GRAMMAR_SCALER_CACHE = {}


def _grammar_scaler():
    """Return a StandardScaler fitted on ``X_grammar``, fitting it only when needed.

    The scaler is refitted only when the global ``X_grammar`` has been rebound
    to a different object since the last call (e.g. the feature cell was
    re-run); otherwise the cached instance is reused.
    """
    if _GRAMMAR_SCALER_CACHE.get("fitted_on") is not X_grammar:
        _GRAMMAR_SCALER_CACHE["scaler"] = StandardScaler().fit(X_grammar)
        _GRAMMAR_SCALER_CACHE["fitted_on"] = X_grammar
    return _GRAMMAR_SCALER_CACHE["scaler"]


def predict_level_combined(text):
    """Predict the level label of a single text with the combined model.

    Builds the same three feature blocks used for training - SBERT embedding,
    TF-IDF vector and scaled grammar features, stacked in that order - and
    passes the resulting one-row matrix to ``model_combined``.

    Args:
        text: The text to classify.

    Returns:
        The first (and only) element of ``model_combined.predict`` for the
        text, i.e. one of the labels the model was trained on.

    NOTE(review): the grammar values come from ``extract_grammar_features``
    while the scaler is fitted on columns loaded from a CSV - confirm both use
    the same feature definitions.
    """
    emb = model_st.encode([text])
    tfidf_vec = vectorizer.transform([text])

    # Wrap the features in a one-row DataFrame carrying X_grammar's column
    # names: the scaler is fitted on a DataFrame, and scikit-learn warns when
    # transform() then receives input without matching feature names.
    grammar_row = pd.DataFrame(
        [extract_grammar_features(text)],
        columns=X_grammar.columns,
    )
    grammar_vec = _grammar_scaler().transform(grammar_row)

    # Block order must match the training matrix: SBERT, TF-IDF, grammar.
    full_vector = hstack([
        csr_matrix(emb),
        tfidf_vec,
        csr_matrix(grammar_vec),
    ])

    return model_combined.predict(full_vector)[0]


In [12]:
# Read a text from the user.
# NOTE(review): input() makes this cell interactive, so its output depends on
# what is typed and is not reproduced by a plain "Run All".
user_input = input("Enter your text in English: ")

# Predict the level of the entered text.
predicted = predict_level_combined(user_input)

# Show the result.
print("Predicted CEFR level:", predicted)

Enter your text in English:  Hi there!My name is Karina. I'm fond of machine learning and find really exciting math! I have been learning math since 20 years old and now i'm totally proud of myself for being so confident in math's problems.


Predicted CEFR level: A2


